| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=ALL,SCALAR |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE2-ONLY |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE3 |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSSE3-ONLY |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE41 |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE42 |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX1 |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX2-ONLY |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512F |
| ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512BW |
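| ; Each @vecN_vMiK test loads an M x iK subvector, bitwise-NOTs it (xor with |
| ; all-ones), stores the result once to %out.subvec.ptr, and then stores |
| ; repeated copies of it at consecutive subvector offsets of %out.vec.ptr |
| ; until the N-bit destination vector is filled. The per-prefix checks cover |
| ; how each subtarget level lowers these subvector-splat store sequences. |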
| |
| define void @vec32_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec32_v2i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %ecx |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %cl, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %cl, 3(%rdx) |
| ; SCALAR-NEXT: movb %al, 2(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE-LABEL: vec32_v2i8: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movl (%rdi), %eax |
| ; SSE-NEXT: notl %eax |
| ; SSE-NEXT: movw %ax, (%rsi) |
| ; SSE-NEXT: movw %ax, (%rdx) |
| ; SSE-NEXT: movw %ax, 2(%rdx) |
| ; SSE-NEXT: retq |
| %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> |
| store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 |
| store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 |
| store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| ret void |
| } |
| |
| define void @vec64_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec64_v2i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %ecx |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %cl, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %cl, 3(%rdx) |
| ; SCALAR-NEXT: movb %al, 2(%rdx) |
| ; SCALAR-NEXT: movb %cl, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %cl, 7(%rdx) |
| ; SCALAR-NEXT: movb %al, 6(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE-LABEL: vec64_v2i8: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movl (%rdi), %eax |
| ; SSE-NEXT: notl %eax |
| ; SSE-NEXT: movw %ax, (%rsi) |
| ; SSE-NEXT: movw %ax, (%rdx) |
| ; SSE-NEXT: movw %ax, 2(%rdx) |
| ; SSE-NEXT: movw %ax, 4(%rdx) |
| ; SSE-NEXT: movw %ax, 6(%rdx) |
| ; SSE-NEXT: retq |
| %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> |
| store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 |
| store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 |
| store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 |
| store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 |
| %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 |
| store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 |
| ret void |
| } |
| |
| define void @vec64_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec64_v2i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 2(%rdi), %eax |
| ; SCALAR-NEXT: movl (%rdi), %ecx |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movw %ax, 2(%rsi) |
| ; SCALAR-NEXT: movw %cx, (%rsi) |
| ; SCALAR-NEXT: movw %ax, 2(%rdx) |
| ; SCALAR-NEXT: movw %cx, (%rdx) |
| ; SCALAR-NEXT: movw %ax, 6(%rdx) |
| ; SCALAR-NEXT: movw %cx, 4(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE-LABEL: vec64_v2i16: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movl (%rdi), %eax |
| ; SSE-NEXT: notl %eax |
| ; SSE-NEXT: movl %eax, (%rsi) |
| ; SSE-NEXT: movl %eax, (%rdx) |
| ; SSE-NEXT: movl %eax, 4(%rdx) |
| ; SSE-NEXT: retq |
| %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> |
| store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 |
| store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 |
| store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec64_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec64_v4i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl 3(%rdi), %eax |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %r8b, (%rsi) |
| ; SCALAR-NEXT: movb %al, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %r8b, (%rdx) |
| ; SCALAR-NEXT: movb %al, 7(%rdx) |
| ; SCALAR-NEXT: movb %cl, 6(%rdx) |
| ; SCALAR-NEXT: movb %dil, 5(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 4(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE-LABEL: vec64_v4i8: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movl (%rdi), %eax |
| ; SSE-NEXT: notl %eax |
| ; SSE-NEXT: movl %eax, (%rsi) |
| ; SSE-NEXT: movl %eax, (%rdx) |
| ; SSE-NEXT: movl %eax, 4(%rdx) |
| ; SSE-NEXT: retq |
| %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> |
| store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 |
| store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 |
| store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec128_v2i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %ecx |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %cl, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %cl, 3(%rdx) |
| ; SCALAR-NEXT: movb %al, 2(%rdx) |
| ; SCALAR-NEXT: movb %cl, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %cl, 7(%rdx) |
| ; SCALAR-NEXT: movb %al, 6(%rdx) |
| ; SCALAR-NEXT: movb %cl, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %cl, 11(%rdx) |
| ; SCALAR-NEXT: movb %al, 10(%rdx) |
| ; SCALAR-NEXT: movb %cl, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %cl, 15(%rdx) |
| ; SCALAR-NEXT: movb %al, 14(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec128_v2i8: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movd %xmm0, %eax |
| ; SSE2-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec128_v2i8: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movd %xmm0, %eax |
| ; SSE3-NEXT: movw %ax, (%rsi) |
| ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE3-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec128_v2i8: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movd %xmm0, %eax |
| ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec128_v2i8: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE41-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec128_v2i8: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE42-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec128_v2i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec128_v2i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> |
| store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 |
| store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 |
| store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 |
| store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 |
| %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 |
| store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 |
| %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 |
| store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 |
| %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 |
| store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 |
| %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 |
| store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 |
| %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 |
| store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 |
| ret void |
| } |
| |
| define void @vec128_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec128_v2i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 2(%rdi), %eax |
| ; SCALAR-NEXT: movl (%rdi), %ecx |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movw %ax, 2(%rsi) |
| ; SCALAR-NEXT: movw %cx, (%rsi) |
| ; SCALAR-NEXT: movw %ax, 2(%rdx) |
| ; SCALAR-NEXT: movw %cx, (%rdx) |
| ; SCALAR-NEXT: movw %ax, 6(%rdx) |
| ; SCALAR-NEXT: movw %cx, 4(%rdx) |
| ; SCALAR-NEXT: movw %ax, 10(%rdx) |
| ; SCALAR-NEXT: movw %cx, 8(%rdx) |
| ; SCALAR-NEXT: movw %ax, 14(%rdx) |
| ; SCALAR-NEXT: movw %cx, 12(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec128_v2i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec128_v2i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec128_v2i16: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> |
| store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 |
| store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 |
| store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 |
| store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 |
| store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec128_v2i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec128_v2i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec128_v2i32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec128_v2i32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec128_v2i32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 |
| store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 |
| store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec128_v2f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec128_v2f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec128_v2f32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec128_v2f32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec128_v2f32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> |
| store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 |
| store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 |
| store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec128_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec128_v4i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl 3(%rdi), %eax |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %r8b, (%rsi) |
| ; SCALAR-NEXT: movb %al, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %r8b, (%rdx) |
| ; SCALAR-NEXT: movb %al, 7(%rdx) |
| ; SCALAR-NEXT: movb %cl, 6(%rdx) |
| ; SCALAR-NEXT: movb %dil, 5(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 4(%rdx) |
| ; SCALAR-NEXT: movb %al, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 8(%rdx) |
| ; SCALAR-NEXT: movb %al, 15(%rdx) |
| ; SCALAR-NEXT: movb %cl, 14(%rdx) |
| ; SCALAR-NEXT: movb %dil, 13(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 12(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec128_v4i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec128_v4i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec128_v4i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> |
| store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 |
| store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 |
| store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 |
| store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 |
| store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec128_v4i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 6(%rdi), %eax |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %r8d |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movw %ax, 6(%rsi) |
| ; SCALAR-NEXT: movw %di, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %r8w, (%rsi) |
| ; SCALAR-NEXT: movw %ax, 6(%rdx) |
| ; SCALAR-NEXT: movw %di, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %r8w, (%rdx) |
| ; SCALAR-NEXT: movw %ax, 14(%rdx) |
| ; SCALAR-NEXT: movw %di, 12(%rdx) |
| ; SCALAR-NEXT: movw %cx, 10(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 8(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec128_v4i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec128_v4i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec128_v4i16: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec128_v4i16: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> |
| store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 |
| store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 |
| store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec128_v8i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 7(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 6(%rdi), %r11d |
| ; SCALAR-NEXT: movzbl 5(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 4(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: movb %bl, 7(%rsi) |
| ; SCALAR-NEXT: movb %r11b, 6(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 5(%rsi) |
| ; SCALAR-NEXT: movb %r9b, 4(%rsi) |
| ; SCALAR-NEXT: movb %r8b, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %bl, 7(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 6(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 5(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 4(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %bl, 15(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 14(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 13(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 12(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec128_v8i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec128_v8i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec128_v8i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec128_v8i8: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 |
| store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 |
| store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v2i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %ecx |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %cl, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %cl, 3(%rdx) |
| ; SCALAR-NEXT: movb %al, 2(%rdx) |
| ; SCALAR-NEXT: movb %cl, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %cl, 7(%rdx) |
| ; SCALAR-NEXT: movb %al, 6(%rdx) |
| ; SCALAR-NEXT: movb %cl, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %cl, 11(%rdx) |
| ; SCALAR-NEXT: movb %al, 10(%rdx) |
| ; SCALAR-NEXT: movb %cl, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %cl, 15(%rdx) |
| ; SCALAR-NEXT: movb %al, 14(%rdx) |
| ; SCALAR-NEXT: movb %cl, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %cl, 19(%rdx) |
| ; SCALAR-NEXT: movb %al, 18(%rdx) |
| ; SCALAR-NEXT: movb %cl, 21(%rdx) |
| ; SCALAR-NEXT: movb %al, 20(%rdx) |
| ; SCALAR-NEXT: movb %cl, 23(%rdx) |
| ; SCALAR-NEXT: movb %al, 22(%rdx) |
| ; SCALAR-NEXT: movb %cl, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %cl, 27(%rdx) |
| ; SCALAR-NEXT: movb %al, 26(%rdx) |
| ; SCALAR-NEXT: movb %cl, 29(%rdx) |
| ; SCALAR-NEXT: movb %al, 28(%rdx) |
| ; SCALAR-NEXT: movb %cl, 31(%rdx) |
| ; SCALAR-NEXT: movb %al, 30(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec256_v2i8: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movd %xmm0, %eax |
| ; SSE2-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec256_v2i8: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movd %xmm0, %eax |
| ; SSE3-NEXT: movw %ax, (%rsi) |
| ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE3-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE3-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec256_v2i8: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movd %xmm0, %eax |
| ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec256_v2i8: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE41-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE41-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec256_v2i8: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE42-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec256_v2i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec256_v2i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> |
| store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 |
| store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 |
| store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 |
| store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 |
| %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 |
| store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 |
| %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 |
| store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 |
| %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 |
| store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 |
| %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 |
| store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 |
| %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 |
| store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 |
| %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 |
| store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 |
| %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 |
| store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 |
| %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 |
| store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 |
| %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 |
| store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 |
| %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 |
| store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 |
| %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 |
| store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 |
| %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 |
| store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 |
| %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 |
| store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 |
| ret void |
| } |
| |
| define void @vec256_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v2i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 2(%rdi), %eax |
| ; SCALAR-NEXT: movl (%rdi), %ecx |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movw %ax, 2(%rsi) |
| ; SCALAR-NEXT: movw %cx, (%rsi) |
| ; SCALAR-NEXT: movw %ax, 2(%rdx) |
| ; SCALAR-NEXT: movw %cx, (%rdx) |
| ; SCALAR-NEXT: movw %ax, 6(%rdx) |
| ; SCALAR-NEXT: movw %cx, 4(%rdx) |
| ; SCALAR-NEXT: movw %ax, 10(%rdx) |
| ; SCALAR-NEXT: movw %cx, 8(%rdx) |
| ; SCALAR-NEXT: movw %ax, 14(%rdx) |
| ; SCALAR-NEXT: movw %cx, 12(%rdx) |
| ; SCALAR-NEXT: movw %ax, 18(%rdx) |
| ; SCALAR-NEXT: movw %cx, 16(%rdx) |
| ; SCALAR-NEXT: movw %ax, 22(%rdx) |
| ; SCALAR-NEXT: movw %cx, 20(%rdx) |
| ; SCALAR-NEXT: movw %ax, 26(%rdx) |
| ; SCALAR-NEXT: movw %cx, 24(%rdx) |
| ; SCALAR-NEXT: movw %ax, 30(%rdx) |
| ; SCALAR-NEXT: movw %cx, 28(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v2i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec256_v2i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec256_v2i16: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> |
| store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 |
| store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 |
| store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 |
| store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 |
| store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 |
| store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 |
| %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 |
| store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 |
| %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 |
| store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 |
| %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 |
| store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v2i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v2i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec256_v2i32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec256_v2i32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec256_v2i32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 |
| store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 |
| store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 |
| store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 |
| store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v2f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v2f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec256_v2f32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec256_v2f32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec256_v2f32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> |
| store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 |
| store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 |
| store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 |
| store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 |
| store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec256_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v2i64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v2i64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec256_v2i64: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> |
| store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 |
| store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 |
| store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec256_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v2f64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v2f64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec256_v2f64: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> |
| %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> |
| store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 |
| store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 |
| store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec256_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v4i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %r8b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %r8b, 7(%rdx) |
| ; SCALAR-NEXT: movb %cl, 6(%rdx) |
| ; SCALAR-NEXT: movb %dil, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 15(%rdx) |
| ; SCALAR-NEXT: movb %cl, 14(%rdx) |
| ; SCALAR-NEXT: movb %dil, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %dil, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 23(%rdx) |
| ; SCALAR-NEXT: movb %cl, 22(%rdx) |
| ; SCALAR-NEXT: movb %dil, 21(%rdx) |
| ; SCALAR-NEXT: movb %al, 20(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 27(%rdx) |
| ; SCALAR-NEXT: movb %cl, 26(%rdx) |
| ; SCALAR-NEXT: movb %dil, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 31(%rdx) |
| ; SCALAR-NEXT: movb %cl, 30(%rdx) |
| ; SCALAR-NEXT: movb %dil, 29(%rdx) |
| ; SCALAR-NEXT: movb %al, 28(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v4i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec256_v4i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec256_v4i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> |
| store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 |
| store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 |
| store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 |
| store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 |
| store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 |
| store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 |
| %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 |
| store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 |
| %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 |
| store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 |
| %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 |
| store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v4i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 6(%rdi), %r8d |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: movw %r8w, 6(%rsi) |
| ; SCALAR-NEXT: movw %di, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %r8w, 6(%rdx) |
| ; SCALAR-NEXT: movw %di, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %r8w, 14(%rdx) |
| ; SCALAR-NEXT: movw %di, 12(%rdx) |
| ; SCALAR-NEXT: movw %cx, 10(%rdx) |
| ; SCALAR-NEXT: movw %ax, 8(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 22(%rdx) |
| ; SCALAR-NEXT: movw %di, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 30(%rdx) |
| ; SCALAR-NEXT: movw %di, 28(%rdx) |
| ; SCALAR-NEXT: movw %cx, 26(%rdx) |
| ; SCALAR-NEXT: movw %ax, 24(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v4i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec256_v4i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec256_v4i16: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec256_v4i16: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> |
| store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 |
| store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 |
| store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 |
| store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 |
| store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec256_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v4i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movaps (%rdi), %xmm0 |
| ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| ; SCALAR-NEXT: movaps %xmm0, (%rsi) |
| ; SCALAR-NEXT: movaps %xmm0, (%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 16(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v4i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec256_v4i32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> |
| store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 |
| store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 |
| store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| ret void |
| } |
| |
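| ; <4 x float> subvector: the <4 x i32> bits are NOT'd and bitcast to float, stored to %out.subvec.ptr, then 2 consecutive copies fill the 256-bit %out.vec.ptr. |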
| define void @vec256_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v4f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movaps (%rdi), %xmm0 |
| ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| ; SCALAR-NEXT: movaps %xmm0, (%rsi) |
| ; SCALAR-NEXT: movaps %xmm0, (%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 16(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v4f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec256_v4f32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> |
| %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> |
| store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 |
| store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 |
| store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| ret void |
| } |
| |
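| ; <8 x i8> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 4 consecutive copies fill the 256-bit %out.vec.ptr. |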
| define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v8i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 7(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 6(%rdi), %r11d |
| ; SCALAR-NEXT: movzbl 5(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 4(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: movb %bl, 7(%rsi) |
| ; SCALAR-NEXT: movb %r11b, 6(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 5(%rsi) |
| ; SCALAR-NEXT: movb %r9b, 4(%rsi) |
| ; SCALAR-NEXT: movb %r8b, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %bl, 7(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 6(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 5(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 4(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %bl, 15(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 14(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 13(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 12(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %bl, 23(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 22(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 21(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 20(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %dil, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %bl, 31(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 30(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 29(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 28(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 27(%rdx) |
| ; SCALAR-NEXT: movb %cl, 26(%rdx) |
| ; SCALAR-NEXT: movb %dil, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v8i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec256_v8i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec256_v8i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec256_v8i8: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 |
| store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 |
| store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 |
| store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 |
| store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| ret void |
| } |
| |
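| ; <8 x i16> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 2 consecutive copies fill the 256-bit %out.vec.ptr. |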
| define void @vec256_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v8i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzwl 14(%rdi), %ebx |
| ; SCALAR-NEXT: movl 12(%rdi), %r11d |
| ; SCALAR-NEXT: movzwl 10(%rdi), %r10d |
| ; SCALAR-NEXT: movl 8(%rdi), %r9d |
| ; SCALAR-NEXT: movzwl 6(%rdi), %r8d |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: notl %r9d |
| ; SCALAR-NEXT: notl %r10d |
| ; SCALAR-NEXT: notl %r11d |
| ; SCALAR-NEXT: notl %ebx |
| ; SCALAR-NEXT: movw %bx, 14(%rsi) |
| ; SCALAR-NEXT: movw %r11w, 12(%rsi) |
| ; SCALAR-NEXT: movw %r10w, 10(%rsi) |
| ; SCALAR-NEXT: movw %r9w, 8(%rsi) |
| ; SCALAR-NEXT: movw %r8w, 6(%rsi) |
| ; SCALAR-NEXT: movw %di, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %bx, 14(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 12(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 10(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 8(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 6(%rdx) |
| ; SCALAR-NEXT: movw %di, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %bx, 30(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 28(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 26(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 24(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 22(%rdx) |
| ; SCALAR-NEXT: movw %di, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v8i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec256_v8i16: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> |
| store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 |
| store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 |
| store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| ret void |
| } |
| |
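| ; <16 x i8> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 2 consecutive copies fill the 256-bit %out.vec.ptr. |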
| define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec256_v16i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbp |
| ; SCALAR-NEXT: pushq %r15 |
| ; SCALAR-NEXT: pushq %r14 |
| ; SCALAR-NEXT: pushq %r13 |
| ; SCALAR-NEXT: pushq %r12 |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 15(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 14(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 13(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 12(%rdi), %r15d |
| ; SCALAR-NEXT: movzbl 11(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 10(%rdi), %ebp |
| ; SCALAR-NEXT: movzbl 9(%rdi), %r14d |
| ; SCALAR-NEXT: movzbl 8(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 7(%rdi), %r12d |
| ; SCALAR-NEXT: movzbl 6(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 5(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 4(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %r13d |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r13b |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r12b |
| ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: movl %r14d, %r10d |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: notb %bpl |
| ; SCALAR-NEXT: movl %ebp, %r14d |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movl %r15d, %edi |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %bpl |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r15b |
| ; SCALAR-NEXT: movb %r15b, 15(%rsi) |
| ; SCALAR-NEXT: movb %bpl, 14(%rsi) |
| ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movl %r9d, %eax |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %r9b, 13(%rsi) |
| ; SCALAR-NEXT: movb %dil, 12(%rsi) |
| ; SCALAR-NEXT: movb %r8b, 11(%rsi) |
| ; SCALAR-NEXT: movb %r14b, 10(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 9(%rsi) |
| ; SCALAR-NEXT: movl %r10d, %r8d |
| ; SCALAR-NEXT: movb %r11b, 8(%rsi) |
| ; SCALAR-NEXT: movl %r11d, %r9d |
| ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %r12b, 7(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 6(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 5(%rsi) |
| ; SCALAR-NEXT: movb %bl, 4(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 3(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %r13b, 1(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, (%rsi) |
| ; SCALAR-NEXT: movb %r15b, 15(%rdx) |
| ; SCALAR-NEXT: movl %r15d, %r11d |
| ; SCALAR-NEXT: movb %bpl, 14(%rdx) |
| ; SCALAR-NEXT: movb %al, 13(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r12b, 12(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r15b, 11(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 10(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 9(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 8(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r9b, 7(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 6(%rdx) |
| ; SCALAR-NEXT: movb %dil, 5(%rdx) |
| ; SCALAR-NEXT: movb %bl, 4(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 1(%rdx) |
| ; SCALAR-NEXT: movl %r10d, %edi |
| ; SCALAR-NEXT: movb %r10b, (%rdx) |
| ; SCALAR-NEXT: movb %r11b, 31(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, 30(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, 29(%rdx) |
| ; SCALAR-NEXT: movb %r12b, 28(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 27(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 26(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 25(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, 24(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 23(%rdx) |
| ; SCALAR-NEXT: movb %al, 22(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 21(%rdx) |
| ; SCALAR-NEXT: movb %bl, 20(%rdx) |
| ; SCALAR-NEXT: movb %sil, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 17(%rdx) |
| ; SCALAR-NEXT: movb %dil, 16(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: popq %r12 |
| ; SCALAR-NEXT: popq %r13 |
| ; SCALAR-NEXT: popq %r14 |
| ; SCALAR-NEXT: popq %r15 |
| ; SCALAR-NEXT: popq %rbp |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec256_v16i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec256_v16i8: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 |
| store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 |
| store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| ret void |
| } |
| |
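| ; <2 x i8> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 24 consecutive copies fill the 384-bit %out.vec.ptr. |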
| define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v2i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %ecx |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %cl, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %cl, 3(%rdx) |
| ; SCALAR-NEXT: movb %al, 2(%rdx) |
| ; SCALAR-NEXT: movb %cl, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %cl, 7(%rdx) |
| ; SCALAR-NEXT: movb %al, 6(%rdx) |
| ; SCALAR-NEXT: movb %cl, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %cl, 11(%rdx) |
| ; SCALAR-NEXT: movb %al, 10(%rdx) |
| ; SCALAR-NEXT: movb %cl, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %cl, 15(%rdx) |
| ; SCALAR-NEXT: movb %al, 14(%rdx) |
| ; SCALAR-NEXT: movb %cl, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %cl, 19(%rdx) |
| ; SCALAR-NEXT: movb %al, 18(%rdx) |
| ; SCALAR-NEXT: movb %cl, 21(%rdx) |
| ; SCALAR-NEXT: movb %al, 20(%rdx) |
| ; SCALAR-NEXT: movb %cl, 23(%rdx) |
| ; SCALAR-NEXT: movb %al, 22(%rdx) |
| ; SCALAR-NEXT: movb %cl, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %cl, 27(%rdx) |
| ; SCALAR-NEXT: movb %al, 26(%rdx) |
| ; SCALAR-NEXT: movb %cl, 29(%rdx) |
| ; SCALAR-NEXT: movb %al, 28(%rdx) |
| ; SCALAR-NEXT: movb %cl, 31(%rdx) |
| ; SCALAR-NEXT: movb %al, 30(%rdx) |
| ; SCALAR-NEXT: movb %cl, 33(%rdx) |
| ; SCALAR-NEXT: movb %al, 32(%rdx) |
| ; SCALAR-NEXT: movb %cl, 35(%rdx) |
| ; SCALAR-NEXT: movb %al, 34(%rdx) |
| ; SCALAR-NEXT: movb %cl, 37(%rdx) |
| ; SCALAR-NEXT: movb %al, 36(%rdx) |
| ; SCALAR-NEXT: movb %cl, 39(%rdx) |
| ; SCALAR-NEXT: movb %al, 38(%rdx) |
| ; SCALAR-NEXT: movb %cl, 41(%rdx) |
| ; SCALAR-NEXT: movb %al, 40(%rdx) |
| ; SCALAR-NEXT: movb %cl, 43(%rdx) |
| ; SCALAR-NEXT: movb %al, 42(%rdx) |
| ; SCALAR-NEXT: movb %cl, 45(%rdx) |
| ; SCALAR-NEXT: movb %al, 44(%rdx) |
| ; SCALAR-NEXT: movb %cl, 47(%rdx) |
| ; SCALAR-NEXT: movb %al, 46(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v2i8: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movd %xmm0, %eax |
| ; SSE2-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v2i8: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movd %xmm0, %eax |
| ; SSE3-NEXT: movw %ax, (%rsi) |
| ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE3-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE3-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE3-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v2i8: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movd %xmm0, %eax |
| ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v2i8: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE41-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE41-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE41-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v2i8: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE42-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE42-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v2i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 |
| ; AVX1-NEXT: vmovaps %ymm1, (%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v2i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> |
| store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 |
| store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 |
| store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 |
| store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 |
| %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 |
| store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 |
| %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 |
| store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 |
| %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 |
| store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 |
| %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 |
| store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 |
| %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 |
| store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 |
| %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 |
| store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 |
| %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 |
| store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 |
| %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 |
| store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 |
| %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 |
| store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 |
| %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 |
| store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 |
| %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 |
| store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 |
| %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 |
| store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 |
| %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 |
| store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 |
| %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16 |
| store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32 |
| %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17 |
| store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2 |
| %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18 |
| store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4 |
| %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19 |
| store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2 |
| %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20 |
| store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8 |
| %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21 |
| store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2 |
| %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22 |
| store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4 |
| %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23 |
| store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2 |
| ret void |
| } |
| |
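| ; <2 x i16> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 12 consecutive copies fill the 384-bit %out.vec.ptr. |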
| define void @vec384_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v2i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %cx, 6(%rdx) |
| ; SCALAR-NEXT: movw %ax, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 10(%rdx) |
| ; SCALAR-NEXT: movw %ax, 8(%rdx) |
| ; SCALAR-NEXT: movw %cx, 14(%rdx) |
| ; SCALAR-NEXT: movw %ax, 12(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movw %cx, 22(%rdx) |
| ; SCALAR-NEXT: movw %ax, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 26(%rdx) |
| ; SCALAR-NEXT: movw %ax, 24(%rdx) |
| ; SCALAR-NEXT: movw %cx, 30(%rdx) |
| ; SCALAR-NEXT: movw %ax, 28(%rdx) |
| ; SCALAR-NEXT: movw %cx, 34(%rdx) |
| ; SCALAR-NEXT: movw %ax, 32(%rdx) |
| ; SCALAR-NEXT: movw %cx, 38(%rdx) |
| ; SCALAR-NEXT: movw %ax, 36(%rdx) |
| ; SCALAR-NEXT: movw %cx, 42(%rdx) |
| ; SCALAR-NEXT: movw %ax, 40(%rdx) |
| ; SCALAR-NEXT: movw %cx, 46(%rdx) |
| ; SCALAR-NEXT: movw %ax, 44(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v2i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v2i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v2i16: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> |
| store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 |
| store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 |
| store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 |
| store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 |
| store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 |
| store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 |
| %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 |
| store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 |
| %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 |
| store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 |
| %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 |
| store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 |
| %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8 |
| store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32 |
| %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9 |
| store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4 |
| %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10 |
| store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8 |
| %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11 |
| store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4 |
| ret void |
| } |
| |
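| ; <2 x i32> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 6 consecutive copies fill the 384-bit %out.vec.ptr. |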
| define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v2i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 44(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v2i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v2i32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 |
| ; AVX1-NEXT: vmovaps %ymm1, (%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec384_v2i32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec384_v2i32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 |
| store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 |
| store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 |
| store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 |
| store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4 |
| store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5 |
| store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| ret void |
| } |
| |
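| ; <2 x float> subvector: the <2 x i32> bits are NOT'd and bitcast to float, stored to %out.subvec.ptr, then 6 consecutive copies fill the 384-bit %out.vec.ptr. |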
| define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v2f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 44(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v2f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v2f32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 |
| ; AVX1-NEXT: vmovaps %ymm1, (%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec384_v2f32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec384_v2f32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> |
| store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 |
| store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 |
| store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 |
| store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 |
| store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4 |
| store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5 |
| store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| ret void |
| } |
| |
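| ; <2 x i64> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 3 consecutive copies fill the 384-bit %out.vec.ptr. |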
| define void @vec384_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v2i64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v2i64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v2i64: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> |
| store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 |
| store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 |
| store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2 |
| store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| ret void |
| } |
| |
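| ; <2 x double> subvector: the <2 x i64> bits are NOT'd and bitcast to double, stored to %out.subvec.ptr, then 3 consecutive copies fill the 384-bit %out.vec.ptr. |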
| define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v2f64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v2f64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v2f64: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> |
| %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> |
| store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 |
| store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 |
| store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2 |
| store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| ret void |
| } |
| |
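| ; <3 x i8> subvector: loaded, bitwise-NOT'd, stored to %out.subvec.ptr, then 16 consecutive copies fill the 384-bit %out.vec.ptr. |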
| define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v3i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl %eax, %ecx |
| ; SCALAR-NEXT: shrl $16, %ecx |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movb %cl, 6(%rdx) |
| ; SCALAR-NEXT: movw %ax, 4(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movw %ax, 8(%rdx) |
| ; SCALAR-NEXT: movb %cl, 14(%rdx) |
| ; SCALAR-NEXT: movw %ax, 12(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movb %cl, 22(%rdx) |
| ; SCALAR-NEXT: movw %ax, 20(%rdx) |
| ; SCALAR-NEXT: movb %cl, 26(%rdx) |
| ; SCALAR-NEXT: movw %ax, 24(%rdx) |
| ; SCALAR-NEXT: movb %cl, 30(%rdx) |
| ; SCALAR-NEXT: movw %ax, 28(%rdx) |
| ; SCALAR-NEXT: movb %cl, 34(%rdx) |
| ; SCALAR-NEXT: movw %ax, 32(%rdx) |
| ; SCALAR-NEXT: movb %cl, 38(%rdx) |
| ; SCALAR-NEXT: movw %ax, 36(%rdx) |
| ; SCALAR-NEXT: movb %cl, 42(%rdx) |
| ; SCALAR-NEXT: movw %ax, 40(%rdx) |
| ; SCALAR-NEXT: movb %cl, 46(%rdx) |
| ; SCALAR-NEXT: movw %ax, 44(%rdx) |
| ; SCALAR-NEXT: movb %cl, 50(%rdx) |
| ; SCALAR-NEXT: movw %ax, 48(%rdx) |
| ; SCALAR-NEXT: movb %cl, 54(%rdx) |
| ; SCALAR-NEXT: movw %ax, 52(%rdx) |
| ; SCALAR-NEXT: movb %cl, 58(%rdx) |
| ; SCALAR-NEXT: movw %ax, 56(%rdx) |
| ; SCALAR-NEXT: movb %cl, 62(%rdx) |
| ; SCALAR-NEXT: movw %ax, 60(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v3i8: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: movl (%rdi), %eax |
| ; SSE2-ONLY-NEXT: notl %eax |
| ; SSE2-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSE2-ONLY-NEXT: movl %eax, %ecx |
| ; SSE2-ONLY-NEXT: shrl $16, %ecx |
| ; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) |
| ; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, (%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 8(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 22(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) |
| ; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v3i8: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: movl (%rdi), %eax |
| ; SSE3-NEXT: notl %eax |
| ; SSE3-NEXT: movw %ax, (%rsi) |
| ; SSE3-NEXT: movl %eax, %ecx |
| ; SSE3-NEXT: shrl $16, %ecx |
| ; SSE3-NEXT: movb %cl, 2(%rsi) |
| ; SSE3-NEXT: movb %cl, 2(%rdx) |
| ; SSE3-NEXT: movw %ax, (%rdx) |
| ; SSE3-NEXT: movb %cl, 6(%rdx) |
| ; SSE3-NEXT: movw %ax, 4(%rdx) |
| ; SSE3-NEXT: movb %cl, 10(%rdx) |
| ; SSE3-NEXT: movw %ax, 8(%rdx) |
| ; SSE3-NEXT: movb %cl, 14(%rdx) |
| ; SSE3-NEXT: movw %ax, 12(%rdx) |
| ; SSE3-NEXT: movb %cl, 18(%rdx) |
| ; SSE3-NEXT: movw %ax, 16(%rdx) |
| ; SSE3-NEXT: movb %cl, 22(%rdx) |
| ; SSE3-NEXT: movw %ax, 20(%rdx) |
| ; SSE3-NEXT: movb %cl, 26(%rdx) |
| ; SSE3-NEXT: movw %ax, 24(%rdx) |
| ; SSE3-NEXT: movb %cl, 30(%rdx) |
| ; SSE3-NEXT: movw %ax, 28(%rdx) |
| ; SSE3-NEXT: movb %cl, 34(%rdx) |
| ; SSE3-NEXT: movw %ax, 32(%rdx) |
| ; SSE3-NEXT: movb %cl, 38(%rdx) |
| ; SSE3-NEXT: movw %ax, 36(%rdx) |
| ; SSE3-NEXT: movb %cl, 42(%rdx) |
| ; SSE3-NEXT: movw %ax, 40(%rdx) |
| ; SSE3-NEXT: movb %cl, 46(%rdx) |
| ; SSE3-NEXT: movw %ax, 44(%rdx) |
| ; SSE3-NEXT: movb %cl, 50(%rdx) |
| ; SSE3-NEXT: movw %ax, 48(%rdx) |
| ; SSE3-NEXT: movb %cl, 54(%rdx) |
| ; SSE3-NEXT: movw %ax, 52(%rdx) |
| ; SSE3-NEXT: movb %cl, 58(%rdx) |
| ; SSE3-NEXT: movw %ax, 56(%rdx) |
| ; SSE3-NEXT: movb %cl, 62(%rdx) |
| ; SSE3-NEXT: movw %ax, 60(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v3i8: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: movl (%rdi), %eax |
| ; SSSE3-ONLY-NEXT: notl %eax |
| ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSSE3-ONLY-NEXT: movl %eax, %ecx |
| ; SSSE3-ONLY-NEXT: shrl $16, %ecx |
| ; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) |
| ; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, (%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) |
| ; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v3i8: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor %xmm1, %xmm0 |
| ; SSE41-NEXT: pextrb $2, %xmm0, 2(%rsi) |
| ; SSE41-NEXT: movd %xmm0, %eax |
| ; SSE41-NEXT: movw %ax, (%rsi) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 2(%rdx) |
| ; SSE41-NEXT: movw %ax, (%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 6(%rdx) |
| ; SSE41-NEXT: movw %ax, 4(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 10(%rdx) |
| ; SSE41-NEXT: movw %ax, 8(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 14(%rdx) |
| ; SSE41-NEXT: movw %ax, 12(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 18(%rdx) |
| ; SSE41-NEXT: movw %ax, 16(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 22(%rdx) |
| ; SSE41-NEXT: movw %ax, 20(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 26(%rdx) |
| ; SSE41-NEXT: movw %ax, 24(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 30(%rdx) |
| ; SSE41-NEXT: movw %ax, 28(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 34(%rdx) |
| ; SSE41-NEXT: movw %ax, 32(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 38(%rdx) |
| ; SSE41-NEXT: movw %ax, 36(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 42(%rdx) |
| ; SSE41-NEXT: movw %ax, 40(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 46(%rdx) |
| ; SSE41-NEXT: movw %ax, 44(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 50(%rdx) |
| ; SSE41-NEXT: movw %ax, 48(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 54(%rdx) |
| ; SSE41-NEXT: movw %ax, 52(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 58(%rdx) |
| ; SSE41-NEXT: movw %ax, 56(%rdx) |
| ; SSE41-NEXT: pextrb $2, %xmm0, 62(%rdx) |
| ; SSE41-NEXT: movw %ax, 60(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v3i8: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor %xmm1, %xmm0 |
| ; SSE42-NEXT: pextrb $2, %xmm0, 2(%rsi) |
| ; SSE42-NEXT: movd %xmm0, %eax |
| ; SSE42-NEXT: movw %ax, (%rsi) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 2(%rdx) |
| ; SSE42-NEXT: movw %ax, (%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 6(%rdx) |
| ; SSE42-NEXT: movw %ax, 4(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 10(%rdx) |
| ; SSE42-NEXT: movw %ax, 8(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 14(%rdx) |
| ; SSE42-NEXT: movw %ax, 12(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 18(%rdx) |
| ; SSE42-NEXT: movw %ax, 16(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 22(%rdx) |
| ; SSE42-NEXT: movw %ax, 20(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 26(%rdx) |
| ; SSE42-NEXT: movw %ax, 24(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 30(%rdx) |
| ; SSE42-NEXT: movw %ax, 28(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 34(%rdx) |
| ; SSE42-NEXT: movw %ax, 32(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 38(%rdx) |
| ; SSE42-NEXT: movw %ax, 36(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 42(%rdx) |
| ; SSE42-NEXT: movw %ax, 40(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 46(%rdx) |
| ; SSE42-NEXT: movw %ax, 44(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 50(%rdx) |
| ; SSE42-NEXT: movw %ax, 48(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 54(%rdx) |
| ; SSE42-NEXT: movw %ax, 52(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 58(%rdx) |
| ; SSE42-NEXT: movw %ax, 56(%rdx) |
| ; SSE42-NEXT: pextrb $2, %xmm0, 62(%rdx) |
| ; SSE42-NEXT: movw %ax, 60(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v3i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rsi) |
| ; AVX1-NEXT: vmovd %xmm0, %eax |
| ; AVX1-NEXT: movw %ax, (%rsi) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdx) |
| ; AVX1-NEXT: movw %ax, (%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 6(%rdx) |
| ; AVX1-NEXT: movw %ax, 4(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 10(%rdx) |
| ; AVX1-NEXT: movw %ax, 8(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 14(%rdx) |
| ; AVX1-NEXT: movw %ax, 12(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdx) |
| ; AVX1-NEXT: movw %ax, 16(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 22(%rdx) |
| ; AVX1-NEXT: movw %ax, 20(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 26(%rdx) |
| ; AVX1-NEXT: movw %ax, 24(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 30(%rdx) |
| ; AVX1-NEXT: movw %ax, 28(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 34(%rdx) |
| ; AVX1-NEXT: movw %ax, 32(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 38(%rdx) |
| ; AVX1-NEXT: movw %ax, 36(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 42(%rdx) |
| ; AVX1-NEXT: movw %ax, 40(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 46(%rdx) |
| ; AVX1-NEXT: movw %ax, 44(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rdx) |
| ; AVX1-NEXT: movw %ax, 48(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 54(%rdx) |
| ; AVX1-NEXT: movw %ax, 52(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 58(%rdx) |
| ; AVX1-NEXT: movw %ax, 56(%rdx) |
| ; AVX1-NEXT: vpextrb $2, %xmm0, 62(%rdx) |
| ; AVX1-NEXT: movw %ax, 60(%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec384_v3i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rsi) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, %eax |
| ; AVX2-ONLY-NEXT: movw %ax, (%rsi) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, (%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 6(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 4(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 10(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 8(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 14(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 12(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 18(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 16(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 22(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 20(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 26(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 24(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 30(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 28(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 34(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 38(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 36(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 42(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 40(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 46(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 44(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 50(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 48(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 54(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 52(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 58(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 56(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 62(%rdx) |
| ; AVX2-ONLY-NEXT: movw %ax, 60(%rdx) |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec384_v3i8: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi) |
| ; AVX512-NEXT: vmovd %xmm0, %eax |
| ; AVX512-NEXT: movw %ax, (%rsi) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rdx) |
| ; AVX512-NEXT: movw %ax, (%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 6(%rdx) |
| ; AVX512-NEXT: movw %ax, 4(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 10(%rdx) |
| ; AVX512-NEXT: movw %ax, 8(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 14(%rdx) |
| ; AVX512-NEXT: movw %ax, 12(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 18(%rdx) |
| ; AVX512-NEXT: movw %ax, 16(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 22(%rdx) |
| ; AVX512-NEXT: movw %ax, 20(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 26(%rdx) |
| ; AVX512-NEXT: movw %ax, 24(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 30(%rdx) |
| ; AVX512-NEXT: movw %ax, 28(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 34(%rdx) |
| ; AVX512-NEXT: movw %ax, 32(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 38(%rdx) |
| ; AVX512-NEXT: movw %ax, 36(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 42(%rdx) |
| ; AVX512-NEXT: movw %ax, 40(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 46(%rdx) |
| ; AVX512-NEXT: movw %ax, 44(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 50(%rdx) |
| ; AVX512-NEXT: movw %ax, 48(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 54(%rdx) |
| ; AVX512-NEXT: movw %ax, 52(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 58(%rdx) |
| ; AVX512-NEXT: movw %ax, 56(%rdx) |
| ; AVX512-NEXT: vpextrb $2, %xmm0, 62(%rdx) |
| ; AVX512-NEXT: movw %ax, 60(%rdx) |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <3 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <3 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1> |
| store <3 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 0 |
| store <3 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 1 |
| store <3 x i8> %in.subvec, ptr %out.subvec1.ptr, align 1 |
| %out.subvec2.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 2 |
| store <3 x i8> %in.subvec, ptr %out.subvec2.ptr, align 2 |
| %out.subvec3.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 3 |
| store <3 x i8> %in.subvec, ptr %out.subvec3.ptr, align 1 |
| %out.subvec4.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 4 |
| store <3 x i8> %in.subvec, ptr %out.subvec4.ptr, align 4 |
| %out.subvec5.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 5 |
| store <3 x i8> %in.subvec, ptr %out.subvec5.ptr, align 1 |
| %out.subvec6.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 6 |
| store <3 x i8> %in.subvec, ptr %out.subvec6.ptr, align 2 |
| %out.subvec7.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 7 |
| store <3 x i8> %in.subvec, ptr %out.subvec7.ptr, align 1 |
| %out.subvec8.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 8 |
| store <3 x i8> %in.subvec, ptr %out.subvec8.ptr, align 8 |
| %out.subvec9.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 9 |
| store <3 x i8> %in.subvec, ptr %out.subvec9.ptr, align 1 |
| %out.subvec10.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 10 |
| store <3 x i8> %in.subvec, ptr %out.subvec10.ptr, align 2 |
| %out.subvec11.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 11 |
| store <3 x i8> %in.subvec, ptr %out.subvec11.ptr, align 1 |
| %out.subvec12.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 12 |
| store <3 x i8> %in.subvec, ptr %out.subvec12.ptr, align 4 |
| %out.subvec13.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 13 |
| store <3 x i8> %in.subvec, ptr %out.subvec13.ptr, align 1 |
| %out.subvec14.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 14 |
| store <3 x i8> %in.subvec, ptr %out.subvec14.ptr, align 2 |
| %out.subvec15.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 15 |
| store <3 x i8> %in.subvec, ptr %out.subvec15.ptr, align 1 |
| ret void |
| } |
| |
| define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v3i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq %rax, %rcx |
| ; SCALAR-NEXT: shrq $32, %rcx |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movw %cx, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movw %cx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movw %cx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movw %cx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movw %cx, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: movw %cx, 44(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: movw %cx, 52(%rdx) |
| ; SCALAR-NEXT: movl %eax, 48(%rdx) |
| ; SCALAR-NEXT: movw %cx, 60(%rdx) |
| ; SCALAR-NEXT: movl %eax, 56(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v3i16: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-ONLY-NEXT: movd %xmm1, (%rsi) |
| ; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax |
| ; SSE2-ONLY-NEXT: movw %ax, 4(%rsi) |
| ; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, (%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v3i16: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE3-NEXT: pxor %xmm0, %xmm1 |
| ; SSE3-NEXT: movd %xmm1, (%rsi) |
| ; SSE3-NEXT: pextrw $2, %xmm1, %eax |
| ; SSE3-NEXT: movw %ax, 4(%rsi) |
| ; SSE3-NEXT: movw %ax, 4(%rdx) |
| ; SSE3-NEXT: movd %xmm1, (%rdx) |
| ; SSE3-NEXT: movw %ax, 12(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE3-NEXT: movw %ax, 20(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE3-NEXT: movw %ax, 28(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE3-NEXT: movw %ax, 36(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE3-NEXT: movw %ax, 44(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE3-NEXT: movw %ax, 52(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE3-NEXT: movw %ax, 60(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v3i16: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1 |
| ; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi) |
| ; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax |
| ; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi) |
| ; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v3i16: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE41-NEXT: pxor %xmm0, %xmm1 |
| ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi) |
| ; SSE41-NEXT: movd %xmm1, (%rsi) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx) |
| ; SSE41-NEXT: movd %xmm1, (%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v3i16: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE42-NEXT: pxor %xmm0, %xmm1 |
| ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi) |
| ; SSE42-NEXT: movd %xmm1, (%rsi) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx) |
| ; SSE42-NEXT: movd %xmm1, (%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v3i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi) |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, (%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 8(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 16(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 24(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 40(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 48(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 56(%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec384_v3i16: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx) |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec384_v3i16: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) |
| ; AVX512-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, (%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 8(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 16(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 24(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 32(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 40(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 48(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 56(%rdx) |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <3 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <3 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1> |
| store <3 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 0 |
| store <3 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 1 |
| store <3 x i16> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| %out.subvec2.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 2 |
| store <3 x i16> %in.subvec, ptr %out.subvec2.ptr, align 4 |
| %out.subvec3.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 3 |
| store <3 x i16> %in.subvec, ptr %out.subvec3.ptr, align 2 |
| %out.subvec4.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 4 |
| store <3 x i16> %in.subvec, ptr %out.subvec4.ptr, align 8 |
| %out.subvec5.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 5 |
| store <3 x i16> %in.subvec, ptr %out.subvec5.ptr, align 2 |
| %out.subvec6.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 6 |
| store <3 x i16> %in.subvec, ptr %out.subvec6.ptr, align 4 |
| %out.subvec7.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 7 |
| store <3 x i16> %in.subvec, ptr %out.subvec7.ptr, align 2 |
| ret void |
| } |
| |
| define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v3i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl 8(%rdi), %eax |
| ; SCALAR-NEXT: movq (%rdi), %rcx |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movl %eax, 8(%rsi) |
| ; SCALAR-NEXT: movq %rcx, (%rsi) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movq %rcx, (%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 16(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 32(%rdx) |
| ; SCALAR-NEXT: movl %eax, 56(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v3i32: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v3i32: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movq %xmm0, (%rsi) |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE3-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE3-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE3-NEXT: movq %xmm0, (%rdx) |
| ; SSE3-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v3i32: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v3i32: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE41-NEXT: movq %xmm0, (%rsi) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE41-NEXT: movq %xmm0, (%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v3i32: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE42-NEXT: movq %xmm0, (%rsi) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE42-NEXT: movq %xmm0, (%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v3i32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) |
| ; AVX-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, (%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 16(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 32(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1> |
| store <3 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 0 |
| store <3 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 1 |
| store <3 x i32> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 2 |
| store <3 x i32> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 3 |
| store <3 x i32> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v3f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl 8(%rdi), %eax |
| ; SCALAR-NEXT: movq (%rdi), %rcx |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movl %eax, 8(%rsi) |
| ; SCALAR-NEXT: movq %rcx, (%rsi) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movq %rcx, (%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 16(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 32(%rdx) |
| ; SCALAR-NEXT: movl %eax, 56(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v3f32: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v3f32: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movq %xmm0, (%rsi) |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE3-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE3-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE3-NEXT: movq %xmm0, (%rdx) |
| ; SSE3-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v3f32: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v3f32: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE41-NEXT: movq %xmm0, (%rsi) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE41-NEXT: movq %xmm0, (%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v3f32: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE42-NEXT: movq %xmm0, (%rsi) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE42-NEXT: movq %xmm0, (%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v3f32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) |
| ; AVX-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, (%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 16(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 32(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1> |
| %in.subvec = bitcast <3 x i32> %in.subvec.int to <3 x float> |
| store <3 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 0 |
| store <3 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 1 |
| store <3 x float> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 2 |
| store <3 x float> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 3 |
| store <3 x float> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v3i64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: movq 16(%rdi), %rdi |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 16(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 48(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v3i64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq 16(%rdi), %rax |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: notq %rax |
| ; SSE2-NEXT: movq %rax, 16(%rsi) |
| ; SSE2-NEXT: movq %rax, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movq %rax, 48(%rdx) |
| ; SSE2-NEXT: movdqu %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v3i64: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rsi) |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) |
| ; AVX1-NEXT: vmovups %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v3i64: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rsi) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1> |
| store <3 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 0 |
| store <3 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 1 |
| store <3 x i64> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v3f64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: movq 16(%rdi), %rdi |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 16(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 48(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v3f64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq 16(%rdi), %rax |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: notq %rax |
| ; SSE2-NEXT: movq %rax, 16(%rsi) |
| ; SSE2-NEXT: movq %rax, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movq %rax, 48(%rdx) |
| ; SSE2-NEXT: movdqu %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v3f64: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rsi) |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) |
| ; AVX1-NEXT: vmovups %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v3f64: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rsi) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1> |
| %in.subvec = bitcast <3 x i64> %in.subvec.int to <3 x double> |
| store <3 x double> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 0 |
| store <3 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 1 |
| store <3 x double> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec384_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v4i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %r8b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %r8b, 7(%rdx) |
| ; SCALAR-NEXT: movb %cl, 6(%rdx) |
| ; SCALAR-NEXT: movb %dil, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 15(%rdx) |
| ; SCALAR-NEXT: movb %cl, 14(%rdx) |
| ; SCALAR-NEXT: movb %dil, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %dil, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 23(%rdx) |
| ; SCALAR-NEXT: movb %cl, 22(%rdx) |
| ; SCALAR-NEXT: movb %dil, 21(%rdx) |
| ; SCALAR-NEXT: movb %al, 20(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 27(%rdx) |
| ; SCALAR-NEXT: movb %cl, 26(%rdx) |
| ; SCALAR-NEXT: movb %dil, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 31(%rdx) |
| ; SCALAR-NEXT: movb %cl, 30(%rdx) |
| ; SCALAR-NEXT: movb %dil, 29(%rdx) |
| ; SCALAR-NEXT: movb %al, 28(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 35(%rdx) |
| ; SCALAR-NEXT: movb %cl, 34(%rdx) |
| ; SCALAR-NEXT: movb %dil, 33(%rdx) |
| ; SCALAR-NEXT: movb %al, 32(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 39(%rdx) |
| ; SCALAR-NEXT: movb %cl, 38(%rdx) |
| ; SCALAR-NEXT: movb %dil, 37(%rdx) |
| ; SCALAR-NEXT: movb %al, 36(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 43(%rdx) |
| ; SCALAR-NEXT: movb %cl, 42(%rdx) |
| ; SCALAR-NEXT: movb %dil, 41(%rdx) |
| ; SCALAR-NEXT: movb %al, 40(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 47(%rdx) |
| ; SCALAR-NEXT: movb %cl, 46(%rdx) |
| ; SCALAR-NEXT: movb %dil, 45(%rdx) |
| ; SCALAR-NEXT: movb %al, 44(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v4i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v4i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v4i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> |
| store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 |
| store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 |
| store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 |
| store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 |
| store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 |
| store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 |
| %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 |
| store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 |
| %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 |
| store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 |
| %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 |
| store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 |
| %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8 |
| store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32 |
| %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9 |
| store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4 |
| %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10 |
| store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8 |
| %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11 |
| store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v4i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 6(%rdi), %r8d |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: movw %r8w, 6(%rsi) |
| ; SCALAR-NEXT: movw %di, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %r8w, 6(%rdx) |
| ; SCALAR-NEXT: movw %di, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %r8w, 14(%rdx) |
| ; SCALAR-NEXT: movw %di, 12(%rdx) |
| ; SCALAR-NEXT: movw %cx, 10(%rdx) |
| ; SCALAR-NEXT: movw %ax, 8(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 22(%rdx) |
| ; SCALAR-NEXT: movw %di, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 30(%rdx) |
| ; SCALAR-NEXT: movw %di, 28(%rdx) |
| ; SCALAR-NEXT: movw %cx, 26(%rdx) |
| ; SCALAR-NEXT: movw %ax, 24(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 38(%rdx) |
| ; SCALAR-NEXT: movw %di, 36(%rdx) |
| ; SCALAR-NEXT: movw %cx, 34(%rdx) |
| ; SCALAR-NEXT: movw %ax, 32(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 46(%rdx) |
| ; SCALAR-NEXT: movw %di, 44(%rdx) |
| ; SCALAR-NEXT: movw %cx, 42(%rdx) |
| ; SCALAR-NEXT: movw %ax, 40(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v4i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v4i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 |
| ; AVX1-NEXT: vmovaps %ymm1, (%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec384_v4i16: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec384_v4i16: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> |
| store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 |
| store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 |
| store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 |
| store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 |
| store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4 |
| store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5 |
| store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec384_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v4i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movaps (%rdi), %xmm0 |
| ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| ; SCALAR-NEXT: movaps %xmm0, (%rsi) |
| ; SCALAR-NEXT: movaps %xmm0, (%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 16(%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v4i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v4i32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> |
| store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 |
| store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 |
| store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2 |
| store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v4f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movaps (%rdi), %xmm0 |
| ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| ; SCALAR-NEXT: movaps %xmm0, (%rsi) |
| ; SCALAR-NEXT: movaps %xmm0, (%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 16(%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v4f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v4f32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> |
| %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> |
| store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 |
| store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 |
| store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2 |
| store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v6i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq %rax, %rcx |
| ; SCALAR-NEXT: shrq $32, %rcx |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movw %cx, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movw %cx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movw %cx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movw %cx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movw %cx, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: movw %cx, 44(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: movw %cx, 52(%rdx) |
| ; SCALAR-NEXT: movl %eax, 48(%rdx) |
| ; SCALAR-NEXT: movw %cx, 60(%rdx) |
| ; SCALAR-NEXT: movl %eax, 56(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v6i8: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-ONLY-NEXT: movd %xmm1, (%rsi) |
| ; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax |
| ; SSE2-ONLY-NEXT: movw %ax, 4(%rsi) |
| ; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, (%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v6i8: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE3-NEXT: pxor %xmm0, %xmm1 |
| ; SSE3-NEXT: movd %xmm1, (%rsi) |
| ; SSE3-NEXT: pextrw $2, %xmm1, %eax |
| ; SSE3-NEXT: movw %ax, 4(%rsi) |
| ; SSE3-NEXT: movw %ax, 4(%rdx) |
| ; SSE3-NEXT: movd %xmm1, (%rdx) |
| ; SSE3-NEXT: movw %ax, 12(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE3-NEXT: movw %ax, 20(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE3-NEXT: movw %ax, 28(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE3-NEXT: movw %ax, 36(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE3-NEXT: movw %ax, 44(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE3-NEXT: movw %ax, 52(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE3-NEXT: movw %ax, 60(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v6i8: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1 |
| ; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi) |
| ; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax |
| ; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi) |
| ; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v6i8: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE41-NEXT: pxor %xmm0, %xmm1 |
| ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi) |
| ; SSE41-NEXT: movd %xmm1, (%rsi) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx) |
| ; SSE41-NEXT: movd %xmm1, (%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx) |
| ; SSE41-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v6i8: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE42-NEXT: pxor %xmm0, %xmm1 |
| ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi) |
| ; SSE42-NEXT: movd %xmm1, (%rsi) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx) |
| ; SSE42-NEXT: movd %xmm1, (%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 16(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 32(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 48(%rdx) |
| ; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx) |
| ; SSE42-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v6i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi) |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, (%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 8(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 16(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 24(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 40(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 48(%rdx) |
| ; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx) |
| ; AVX1-NEXT: vmovd %xmm0, 56(%rdx) |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec384_v6i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx) |
| ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx) |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx) |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec384_v6i8: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) |
| ; AVX512-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, (%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 8(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 16(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 24(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 32(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 40(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 48(%rdx) |
| ; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx) |
| ; AVX512-NEXT: vmovd %xmm0, 56(%rdx) |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <6 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <6 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <6 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 0 |
| store <6 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 1 |
| store <6 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| %out.subvec2.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 2 |
| store <6 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 |
| %out.subvec3.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 3 |
| store <6 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 |
| %out.subvec4.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 4 |
| store <6 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 |
| %out.subvec5.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 5 |
| store <6 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 |
| %out.subvec6.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 6 |
| store <6 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 |
| %out.subvec7.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 7 |
| store <6 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 |
| ret void |
| } |
| |
| define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v6i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movl 8(%rdi), %ecx |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 8(%rsi) |
| ; SCALAR-NEXT: movl %ecx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 56(%rdx) |
| ; SCALAR-NEXT: movq %rax, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v6i16: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v6i16: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movq %xmm0, (%rsi) |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE3-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE3-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE3-NEXT: movq %xmm0, (%rdx) |
| ; SSE3-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v6i16: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v6i16: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE41-NEXT: movq %xmm0, (%rsi) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE41-NEXT: movq %xmm0, (%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v6i16: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE42-NEXT: movq %xmm0, (%rsi) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE42-NEXT: movq %xmm0, (%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v6i16: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) |
| ; AVX-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, (%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 16(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 32(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <6 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <6 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> |
| store <6 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 0 |
| store <6 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 1 |
| store <6 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 2 |
| store <6 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 3 |
| store <6 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v6i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: movq 16(%rdi), %rdi |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 16(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 48(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v6i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movq %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqu %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v6i32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rsi) |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) |
| ; AVX1-NEXT: vmovups %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v6i32: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rsi) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> |
| store <6 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 0 |
| store <6 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 1 |
| store <6 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v6f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: movq 16(%rdi), %rdi |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 16(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 48(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v6f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movq %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqu %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v6f32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rsi) |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) |
| ; AVX1-NEXT: vmovups %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v6f32: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rsi) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> |
| %in.subvec = bitcast <6 x i32> %in.subvec.int to <6 x float> |
| store <6 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 0 |
| store <6 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 1 |
| store <6 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v8i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 7(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 6(%rdi), %r11d |
| ; SCALAR-NEXT: movzbl 5(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 4(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: movb %bl, 7(%rsi) |
| ; SCALAR-NEXT: movb %r11b, 6(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 5(%rsi) |
| ; SCALAR-NEXT: movb %r9b, 4(%rsi) |
| ; SCALAR-NEXT: movb %r8b, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %bl, 7(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 6(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 5(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 4(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %bl, 15(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 14(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 13(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 12(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %bl, 23(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 22(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 21(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 20(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %dil, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %bl, 31(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 30(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 29(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 28(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 27(%rdx) |
| ; SCALAR-NEXT: movb %cl, 26(%rdx) |
| ; SCALAR-NEXT: movb %dil, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %bl, 39(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 38(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 37(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 36(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 35(%rdx) |
| ; SCALAR-NEXT: movb %cl, 34(%rdx) |
| ; SCALAR-NEXT: movb %dil, 33(%rdx) |
| ; SCALAR-NEXT: movb %al, 32(%rdx) |
| ; SCALAR-NEXT: movb %bl, 47(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 46(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 45(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 44(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 43(%rdx) |
| ; SCALAR-NEXT: movb %cl, 42(%rdx) |
| ; SCALAR-NEXT: movb %dil, 41(%rdx) |
| ; SCALAR-NEXT: movb %al, 40(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v8i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v8i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 |
| ; AVX1-NEXT: vmovaps %ymm1, (%rdx) |
| ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec384_v8i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec384_v8i8: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 |
| store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 |
| store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 |
| store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 |
| store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4 |
| store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5 |
| store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v8i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzwl 14(%rdi), %ebx |
| ; SCALAR-NEXT: movl 12(%rdi), %r11d |
| ; SCALAR-NEXT: movzwl 10(%rdi), %r10d |
| ; SCALAR-NEXT: movl 8(%rdi), %r9d |
| ; SCALAR-NEXT: movzwl 6(%rdi), %r8d |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: notl %r9d |
| ; SCALAR-NEXT: notl %r10d |
| ; SCALAR-NEXT: notl %r11d |
| ; SCALAR-NEXT: notl %ebx |
| ; SCALAR-NEXT: movw %bx, 14(%rsi) |
| ; SCALAR-NEXT: movw %r11w, 12(%rsi) |
| ; SCALAR-NEXT: movw %r10w, 10(%rsi) |
| ; SCALAR-NEXT: movw %r9w, 8(%rsi) |
| ; SCALAR-NEXT: movw %r8w, 6(%rsi) |
| ; SCALAR-NEXT: movw %di, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %bx, 14(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 12(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 10(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 8(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 6(%rdx) |
| ; SCALAR-NEXT: movw %di, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %bx, 30(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 28(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 26(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 24(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 22(%rdx) |
| ; SCALAR-NEXT: movw %di, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movw %bx, 46(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 44(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 42(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 40(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 38(%rdx) |
| ; SCALAR-NEXT: movw %di, 36(%rdx) |
| ; SCALAR-NEXT: movw %cx, 34(%rdx) |
| ; SCALAR-NEXT: movw %ax, 32(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v8i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v8i16: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> |
| store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 |
| store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 |
| store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2 |
| store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v12i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movl 8(%rdi), %ecx |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 8(%rsi) |
| ; SCALAR-NEXT: movl %ecx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 56(%rdx) |
| ; SCALAR-NEXT: movq %rax, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec384_v12i8: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec384_v12i8: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movq %xmm0, (%rsi) |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSE3-NEXT: movd %xmm1, 8(%rsi) |
| ; SSE3-NEXT: movd %xmm1, 8(%rdx) |
| ; SSE3-NEXT: movq %xmm0, (%rdx) |
| ; SSE3-NEXT: movd %xmm1, 24(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 40(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE3-NEXT: movd %xmm1, 56(%rdx) |
| ; SSE3-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec384_v12i8: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) |
| ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec384_v12i8: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE41-NEXT: movq %xmm0, (%rsi) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE41-NEXT: movq %xmm0, (%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE41-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec384_v12i8: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) |
| ; SSE42-NEXT: movq %xmm0, (%rsi) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) |
| ; SSE42-NEXT: movq %xmm0, (%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 16(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 32(%rdx) |
| ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) |
| ; SSE42-NEXT: movq %xmm0, 48(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v12i8: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) |
| ; AVX-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, (%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 16(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 32(%rdx) |
| ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) |
| ; AVX-NEXT: vmovq %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <12 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <12 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <12 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 0 |
| store <12 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 1 |
| store <12 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 2 |
| store <12 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 3 |
| store <12 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v12i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: movq 16(%rdi), %rdi |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 16(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 48(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v12i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movq %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqu %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v12i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rsi) |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) |
| ; AVX1-NEXT: vmovups %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v12i16: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rsi) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <12 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <12 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> |
| store <12 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 0 |
| store <12 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 1 |
| store <12 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v16i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbp |
| ; SCALAR-NEXT: pushq %r15 |
| ; SCALAR-NEXT: pushq %r14 |
| ; SCALAR-NEXT: pushq %r13 |
| ; SCALAR-NEXT: pushq %r12 |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 15(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 14(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 13(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 12(%rdi), %r11d |
| ; SCALAR-NEXT: movzbl 11(%rdi), %r13d |
| ; SCALAR-NEXT: movzbl 10(%rdi), %r12d |
| ; SCALAR-NEXT: movzbl 9(%rdi), %ebp |
| ; SCALAR-NEXT: movzbl 8(%rdi), %r14d |
| ; SCALAR-NEXT: movzbl 7(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 6(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 5(%rdi), %r15d |
| ; SCALAR-NEXT: movzbl 4(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movl %r15d, %r9d |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r14b |
| ; SCALAR-NEXT: notb %bpl |
| ; SCALAR-NEXT: movl %ebp, %r15d |
| ; SCALAR-NEXT: notb %r12b |
| ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r13b |
| ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, 15(%rsi) |
| ; SCALAR-NEXT: movb %cl, 14(%rsi) |
| ; SCALAR-NEXT: movl %edi, %eax |
| ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %dil, 13(%rsi) |
| ; SCALAR-NEXT: movb %r11b, 12(%rsi) |
| ; SCALAR-NEXT: movl %r11d, %ebp |
| ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %r13b, 11(%rsi) |
| ; SCALAR-NEXT: movb %r12b, 10(%rsi) |
| ; SCALAR-NEXT: movb %r15b, 9(%rsi) |
| ; SCALAR-NEXT: movb %r14b, 8(%rsi) |
| ; SCALAR-NEXT: movb %bl, 7(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 6(%rsi) |
| ; SCALAR-NEXT: movl %r10d, %ebx |
| ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %r9b, 5(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r11b, 4(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r12b, 3(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r13b, 1(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, (%rsi) |
| ; SCALAR-NEXT: movb %r8b, 15(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 14(%rdx) |
| ; SCALAR-NEXT: movb %al, 13(%rdx) |
| ; SCALAR-NEXT: movb %bpl, 12(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 11(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 10(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 9(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 8(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %bpl, 7(%rdx) |
| ; SCALAR-NEXT: movb %bl, 6(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 5(%rdx) |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %r11b, 4(%rdx) |
| ; SCALAR-NEXT: movb %r12b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movl %r13d, %ebx |
| ; SCALAR-NEXT: movb %r13b, 1(%rdx) |
| ; SCALAR-NEXT: movl %r10d, %esi |
| ; SCALAR-NEXT: movb %r10b, (%rdx) |
| ; SCALAR-NEXT: movb %r8b, 31(%rdx) |
| ; SCALAR-NEXT: movb %dil, 30(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 29(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r11b, 28(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r13b, 27(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r12b, 26(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 25(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 24(%rdx) |
| ; SCALAR-NEXT: movb %bpl, 23(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, 22(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 21(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r9b, 20(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %bl, 17(%rdx) |
| ; SCALAR-NEXT: movb %sil, 16(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 47(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r8b, 46(%rdx) |
| ; SCALAR-NEXT: movb %al, 45(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 44(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 43(%rdx) |
| ; SCALAR-NEXT: movb %r12b, 42(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 41(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 40(%rdx) |
| ; SCALAR-NEXT: movb %bpl, 39(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 38(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 37(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 36(%rdx) |
| ; SCALAR-NEXT: movb %dil, 35(%rdx) |
| ; SCALAR-NEXT: movb %cl, 34(%rdx) |
| ; SCALAR-NEXT: movb %bl, 33(%rdx) |
| ; SCALAR-NEXT: movb %sil, 32(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: popq %r12 |
| ; SCALAR-NEXT: popq %r13 |
| ; SCALAR-NEXT: popq %r14 |
| ; SCALAR-NEXT: popq %r15 |
| ; SCALAR-NEXT: popq %rbp |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v16i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec384_v16i8: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 |
| store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 |
| store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2 |
| store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec384_v24i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: movq 16(%rdi), %rdi |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 16(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 48(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec384_v24i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movq %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movq %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqu %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec384_v24i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rsi) |
| ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) |
| ; AVX1-NEXT: vmovaps %xmm0, (%rdx) |
| ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) |
| ; AVX1-NEXT: vmovups %xmm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec384_v24i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rsi) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX2-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX2-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <24 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <24 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <24 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 0 |
| store <24 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 1 |
| store <24 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v2i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %ecx |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %cl, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %cl, 3(%rdx) |
| ; SCALAR-NEXT: movb %al, 2(%rdx) |
| ; SCALAR-NEXT: movb %cl, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %cl, 7(%rdx) |
| ; SCALAR-NEXT: movb %al, 6(%rdx) |
| ; SCALAR-NEXT: movb %cl, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %cl, 11(%rdx) |
| ; SCALAR-NEXT: movb %al, 10(%rdx) |
| ; SCALAR-NEXT: movb %cl, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %cl, 15(%rdx) |
| ; SCALAR-NEXT: movb %al, 14(%rdx) |
| ; SCALAR-NEXT: movb %cl, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %cl, 19(%rdx) |
| ; SCALAR-NEXT: movb %al, 18(%rdx) |
| ; SCALAR-NEXT: movb %cl, 21(%rdx) |
| ; SCALAR-NEXT: movb %al, 20(%rdx) |
| ; SCALAR-NEXT: movb %cl, 23(%rdx) |
| ; SCALAR-NEXT: movb %al, 22(%rdx) |
| ; SCALAR-NEXT: movb %cl, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %cl, 27(%rdx) |
| ; SCALAR-NEXT: movb %al, 26(%rdx) |
| ; SCALAR-NEXT: movb %cl, 29(%rdx) |
| ; SCALAR-NEXT: movb %al, 28(%rdx) |
| ; SCALAR-NEXT: movb %cl, 31(%rdx) |
| ; SCALAR-NEXT: movb %al, 30(%rdx) |
| ; SCALAR-NEXT: movb %cl, 33(%rdx) |
| ; SCALAR-NEXT: movb %al, 32(%rdx) |
| ; SCALAR-NEXT: movb %cl, 35(%rdx) |
| ; SCALAR-NEXT: movb %al, 34(%rdx) |
| ; SCALAR-NEXT: movb %cl, 37(%rdx) |
| ; SCALAR-NEXT: movb %al, 36(%rdx) |
| ; SCALAR-NEXT: movb %cl, 39(%rdx) |
| ; SCALAR-NEXT: movb %al, 38(%rdx) |
| ; SCALAR-NEXT: movb %cl, 41(%rdx) |
| ; SCALAR-NEXT: movb %al, 40(%rdx) |
| ; SCALAR-NEXT: movb %cl, 43(%rdx) |
| ; SCALAR-NEXT: movb %al, 42(%rdx) |
| ; SCALAR-NEXT: movb %cl, 45(%rdx) |
| ; SCALAR-NEXT: movb %al, 44(%rdx) |
| ; SCALAR-NEXT: movb %cl, 47(%rdx) |
| ; SCALAR-NEXT: movb %al, 46(%rdx) |
| ; SCALAR-NEXT: movb %cl, 49(%rdx) |
| ; SCALAR-NEXT: movb %al, 48(%rdx) |
| ; SCALAR-NEXT: movb %cl, 51(%rdx) |
| ; SCALAR-NEXT: movb %al, 50(%rdx) |
| ; SCALAR-NEXT: movb %cl, 53(%rdx) |
| ; SCALAR-NEXT: movb %al, 52(%rdx) |
| ; SCALAR-NEXT: movb %cl, 55(%rdx) |
| ; SCALAR-NEXT: movb %al, 54(%rdx) |
| ; SCALAR-NEXT: movb %cl, 57(%rdx) |
| ; SCALAR-NEXT: movb %al, 56(%rdx) |
| ; SCALAR-NEXT: movb %cl, 59(%rdx) |
| ; SCALAR-NEXT: movb %al, 58(%rdx) |
| ; SCALAR-NEXT: movb %cl, 61(%rdx) |
| ; SCALAR-NEXT: movb %al, 60(%rdx) |
| ; SCALAR-NEXT: movb %cl, 63(%rdx) |
| ; SCALAR-NEXT: movb %al, 62(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-ONLY-LABEL: vec512_v2i8: |
| ; SSE2-ONLY: # %bb.0: |
| ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-ONLY-NEXT: movd %xmm0, %eax |
| ; SSE2-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-ONLY-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-ONLY-NEXT: retq |
| ; |
| ; SSE3-LABEL: vec512_v2i8: |
| ; SSE3: # %bb.0: |
| ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE3-NEXT: pxor (%rdi), %xmm0 |
| ; SSE3-NEXT: movd %xmm0, %eax |
| ; SSE3-NEXT: movw %ax, (%rsi) |
| ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE3-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE3-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE3-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE3-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE3-NEXT: retq |
| ; |
| ; SSSE3-ONLY-LABEL: vec512_v2i8: |
| ; SSSE3-ONLY: # %bb.0: |
| ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 |
| ; SSSE3-ONLY-NEXT: movd %xmm0, %eax |
| ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) |
| ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSSE3-ONLY-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSSE3-ONLY-NEXT: retq |
| ; |
| ; SSE41-LABEL: vec512_v2i8: |
| ; SSE41: # %bb.0: |
| ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE41-NEXT: pxor (%rdi), %xmm0 |
| ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE41-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE41-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE41-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE41-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE41-NEXT: retq |
| ; |
| ; SSE42-LABEL: vec512_v2i8: |
| ; SSE42: # %bb.0: |
| ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE42-NEXT: pxor (%rdi), %xmm0 |
| ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) |
| ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE42-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE42-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE42-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE42-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v2i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec512_v2i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastw %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512F-LABEL: vec512_v2i8: |
| ; AVX512F: # %bb.0: |
| ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 |
| ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 |
| ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-NEXT: vzeroupper |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: vec512_v2i8: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) |
| ; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0 |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> |
| store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 |
| store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 |
| store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 |
| %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 |
| store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 |
| %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 |
| store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 |
| %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 |
| store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 |
| %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 |
| store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 |
| %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 |
| store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 |
| %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 |
| store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 |
| %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 |
| store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 |
| %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 |
| store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 |
| %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 |
| store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 |
| %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 |
| store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 |
| %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 |
| store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 |
| %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 |
| store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 |
| %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 |
| store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 |
| %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 |
| store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 |
| %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16 |
| store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32 |
| %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17 |
| store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2 |
| %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18 |
| store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4 |
| %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19 |
| store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2 |
| %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20 |
| store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8 |
| %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21 |
| store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2 |
| %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22 |
| store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4 |
| %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23 |
| store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2 |
| %out.subvec24.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 24 |
| store <2 x i8> %in.subvec, ptr %out.subvec24.ptr, align 16 |
| %out.subvec25.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 25 |
| store <2 x i8> %in.subvec, ptr %out.subvec25.ptr, align 2 |
| %out.subvec26.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 26 |
| store <2 x i8> %in.subvec, ptr %out.subvec26.ptr, align 4 |
| %out.subvec27.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 27 |
| store <2 x i8> %in.subvec, ptr %out.subvec27.ptr, align 2 |
| %out.subvec28.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 28 |
| store <2 x i8> %in.subvec, ptr %out.subvec28.ptr, align 8 |
| %out.subvec29.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 29 |
| store <2 x i8> %in.subvec, ptr %out.subvec29.ptr, align 2 |
| %out.subvec30.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 30 |
| store <2 x i8> %in.subvec, ptr %out.subvec30.ptr, align 4 |
| %out.subvec31.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 31 |
| store <2 x i8> %in.subvec, ptr %out.subvec31.ptr, align 2 |
| ret void |
| } |
| |
| define void @vec512_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v2i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %cx, 6(%rdx) |
| ; SCALAR-NEXT: movw %ax, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 10(%rdx) |
| ; SCALAR-NEXT: movw %ax, 8(%rdx) |
| ; SCALAR-NEXT: movw %cx, 14(%rdx) |
| ; SCALAR-NEXT: movw %ax, 12(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movw %cx, 22(%rdx) |
| ; SCALAR-NEXT: movw %ax, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 26(%rdx) |
| ; SCALAR-NEXT: movw %ax, 24(%rdx) |
| ; SCALAR-NEXT: movw %cx, 30(%rdx) |
| ; SCALAR-NEXT: movw %ax, 28(%rdx) |
| ; SCALAR-NEXT: movw %cx, 34(%rdx) |
| ; SCALAR-NEXT: movw %ax, 32(%rdx) |
| ; SCALAR-NEXT: movw %cx, 38(%rdx) |
| ; SCALAR-NEXT: movw %ax, 36(%rdx) |
| ; SCALAR-NEXT: movw %cx, 42(%rdx) |
| ; SCALAR-NEXT: movw %ax, 40(%rdx) |
| ; SCALAR-NEXT: movw %cx, 46(%rdx) |
| ; SCALAR-NEXT: movw %ax, 44(%rdx) |
| ; SCALAR-NEXT: movw %cx, 50(%rdx) |
| ; SCALAR-NEXT: movw %ax, 48(%rdx) |
| ; SCALAR-NEXT: movw %cx, 54(%rdx) |
| ; SCALAR-NEXT: movw %ax, 52(%rdx) |
| ; SCALAR-NEXT: movw %cx, 58(%rdx) |
| ; SCALAR-NEXT: movw %ax, 56(%rdx) |
| ; SCALAR-NEXT: movw %cx, 62(%rdx) |
| ; SCALAR-NEXT: movw %ax, 60(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v2i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v2i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec512_v2i16: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec512_v2i16: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 |
| ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> |
| store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 |
| store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 |
| store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 |
| store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 |
| store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 |
| store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 |
| %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 |
| store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 |
| %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 |
| store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 |
| %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 |
| store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 |
| %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8 |
| store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32 |
| %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9 |
| store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4 |
| %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10 |
| store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8 |
| %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11 |
| store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4 |
| %out.subvec12.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 12 |
| store <2 x i16> %in.subvec, ptr %out.subvec12.ptr, align 16 |
| %out.subvec13.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 13 |
| store <2 x i16> %in.subvec, ptr %out.subvec13.ptr, align 4 |
| %out.subvec14.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 14 |
| store <2 x i16> %in.subvec, ptr %out.subvec14.ptr, align 8 |
| %out.subvec15.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 15 |
| store <2 x i16> %in.subvec, ptr %out.subvec15.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v2i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 44(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 52(%rdx) |
| ; SCALAR-NEXT: movl %eax, 48(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 60(%rdx) |
| ; SCALAR-NEXT: movl %eax, 56(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v2i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v2i32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec512_v2i32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec512_v2i32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 |
| ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 |
| store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 |
| store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 |
| store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 |
| store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4 |
| store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5 |
| store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| %out.subvec6.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 6 |
| store <2 x i32> %in.subvec, ptr %out.subvec6.ptr, align 16 |
| %out.subvec7.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 7 |
| store <2 x i32> %in.subvec, ptr %out.subvec7.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v2f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %ecx |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ecx, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ecx, 12(%rdx) |
| ; SCALAR-NEXT: movl %eax, 8(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 20(%rdx) |
| ; SCALAR-NEXT: movl %eax, 16(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, 24(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 44(%rdx) |
| ; SCALAR-NEXT: movl %eax, 40(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 52(%rdx) |
| ; SCALAR-NEXT: movl %eax, 48(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 60(%rdx) |
| ; SCALAR-NEXT: movl %eax, 56(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v2f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v2f32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec512_v2f32: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec512_v2f32: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 |
| ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> |
| %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> |
| store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 |
| store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 |
| store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 |
| store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 |
| store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4 |
| store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5 |
| store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| %out.subvec6.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 6 |
| store <2 x float> %in.subvec, ptr %out.subvec6.ptr, align 16 |
| %out.subvec7.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 7 |
| store <2 x float> %in.subvec, ptr %out.subvec7.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec512_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v2i64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 56(%rdx) |
| ; SCALAR-NEXT: movq %rax, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v2i64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec512_v2i64: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> |
| store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 |
| store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 |
| store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2 |
| store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| %out.subvec3.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 3 |
| store <2 x i64> %in.subvec, ptr %out.subvec3.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec512_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v2f64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq (%rdi), %rax |
| ; SCALAR-NEXT: movq 8(%rdi), %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: movq %rcx, 8(%rsi) |
| ; SCALAR-NEXT: movq %rax, (%rsi) |
| ; SCALAR-NEXT: movq %rcx, 8(%rdx) |
| ; SCALAR-NEXT: movq %rax, (%rdx) |
| ; SCALAR-NEXT: movq %rcx, 24(%rdx) |
| ; SCALAR-NEXT: movq %rax, 16(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 40(%rdx) |
| ; SCALAR-NEXT: movq %rax, 32(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 56(%rdx) |
| ; SCALAR-NEXT: movq %rax, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v2f64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec512_v2f64: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> |
| %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> |
| store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 |
| store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 |
| store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2 |
| store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| %out.subvec3.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 3 |
| store <2 x double> %in.subvec, ptr %out.subvec3.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; ALL-LABEL: vec512_v2i128: |
| ; ALL: # %bb.0: |
| ; ALL-NEXT: movq 16(%rdi), %rax |
| ; ALL-NEXT: movq 24(%rdi), %rcx |
| ; ALL-NEXT: movq (%rdi), %r8 |
| ; ALL-NEXT: movq 8(%rdi), %rdi |
| ; ALL-NEXT: notq %rdi |
| ; ALL-NEXT: notq %r8 |
| ; ALL-NEXT: notq %rcx |
| ; ALL-NEXT: notq %rax |
| ; ALL-NEXT: movq %rax, 16(%rsi) |
| ; ALL-NEXT: movq %rcx, 24(%rsi) |
| ; ALL-NEXT: movq %r8, (%rsi) |
| ; ALL-NEXT: movq %rdi, 8(%rsi) |
| ; ALL-NEXT: movq %rax, 16(%rdx) |
| ; ALL-NEXT: movq %rcx, 24(%rdx) |
| ; ALL-NEXT: movq %r8, (%rdx) |
| ; ALL-NEXT: movq %rdi, 8(%rdx) |
| ; ALL-NEXT: movq %rax, 48(%rdx) |
| ; ALL-NEXT: movq %rcx, 56(%rdx) |
| ; ALL-NEXT: movq %r8, 32(%rdx) |
| ; ALL-NEXT: movq %rdi, 40(%rdx) |
| ; ALL-NEXT: retq |
| %in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <2 x i128> %in.subvec.not, <i128 -1, i128 -1> |
| store <2 x i128> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 0 |
| store <2 x i128> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 1 |
| store <2 x i128> %in.subvec, ptr %out.subvec1.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec512_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v4i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %r8b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %r8b, 7(%rdx) |
| ; SCALAR-NEXT: movb %cl, 6(%rdx) |
| ; SCALAR-NEXT: movb %dil, 5(%rdx) |
| ; SCALAR-NEXT: movb %al, 4(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 15(%rdx) |
| ; SCALAR-NEXT: movb %cl, 14(%rdx) |
| ; SCALAR-NEXT: movb %dil, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %dil, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 23(%rdx) |
| ; SCALAR-NEXT: movb %cl, 22(%rdx) |
| ; SCALAR-NEXT: movb %dil, 21(%rdx) |
| ; SCALAR-NEXT: movb %al, 20(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 27(%rdx) |
| ; SCALAR-NEXT: movb %cl, 26(%rdx) |
| ; SCALAR-NEXT: movb %dil, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 31(%rdx) |
| ; SCALAR-NEXT: movb %cl, 30(%rdx) |
| ; SCALAR-NEXT: movb %dil, 29(%rdx) |
| ; SCALAR-NEXT: movb %al, 28(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 35(%rdx) |
| ; SCALAR-NEXT: movb %cl, 34(%rdx) |
| ; SCALAR-NEXT: movb %dil, 33(%rdx) |
| ; SCALAR-NEXT: movb %al, 32(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 39(%rdx) |
| ; SCALAR-NEXT: movb %cl, 38(%rdx) |
| ; SCALAR-NEXT: movb %dil, 37(%rdx) |
| ; SCALAR-NEXT: movb %al, 36(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 43(%rdx) |
| ; SCALAR-NEXT: movb %cl, 42(%rdx) |
| ; SCALAR-NEXT: movb %dil, 41(%rdx) |
| ; SCALAR-NEXT: movb %al, 40(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 47(%rdx) |
| ; SCALAR-NEXT: movb %cl, 46(%rdx) |
| ; SCALAR-NEXT: movb %dil, 45(%rdx) |
| ; SCALAR-NEXT: movb %al, 44(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 51(%rdx) |
| ; SCALAR-NEXT: movb %cl, 50(%rdx) |
| ; SCALAR-NEXT: movb %dil, 49(%rdx) |
| ; SCALAR-NEXT: movb %al, 48(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 55(%rdx) |
| ; SCALAR-NEXT: movb %cl, 54(%rdx) |
| ; SCALAR-NEXT: movb %dil, 53(%rdx) |
| ; SCALAR-NEXT: movb %al, 52(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 59(%rdx) |
| ; SCALAR-NEXT: movb %cl, 58(%rdx) |
| ; SCALAR-NEXT: movb %dil, 57(%rdx) |
| ; SCALAR-NEXT: movb %al, 56(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 63(%rdx) |
| ; SCALAR-NEXT: movb %cl, 62(%rdx) |
| ; SCALAR-NEXT: movb %dil, 61(%rdx) |
| ; SCALAR-NEXT: movb %al, 60(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v4i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movd %xmm0, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v4i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec512_v4i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec512_v4i8: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovd %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 |
| ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> |
| store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 |
| store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 |
| store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 |
| %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 |
| store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 |
| %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 |
| store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 |
| %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 |
| store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 |
| %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 |
| store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 |
| %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 |
| store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 |
| %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 |
| store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 |
| %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8 |
| store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32 |
| %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9 |
| store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4 |
| %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10 |
| store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8 |
| %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11 |
| store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4 |
| %out.subvec12.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 12 |
| store <4 x i8> %in.subvec, ptr %out.subvec12.ptr, align 16 |
| %out.subvec13.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 13 |
| store <4 x i8> %in.subvec, ptr %out.subvec13.ptr, align 4 |
| %out.subvec14.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 14 |
| store <4 x i8> %in.subvec, ptr %out.subvec14.ptr, align 8 |
| %out.subvec15.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 15 |
| store <4 x i8> %in.subvec, ptr %out.subvec15.ptr, align 4 |
| ret void |
| } |
| |
| define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v4i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movzwl 6(%rdi), %r8d |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: movw %r8w, 6(%rsi) |
| ; SCALAR-NEXT: movw %di, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %r8w, 6(%rdx) |
| ; SCALAR-NEXT: movw %di, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %r8w, 14(%rdx) |
| ; SCALAR-NEXT: movw %di, 12(%rdx) |
| ; SCALAR-NEXT: movw %cx, 10(%rdx) |
| ; SCALAR-NEXT: movw %ax, 8(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 22(%rdx) |
| ; SCALAR-NEXT: movw %di, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 30(%rdx) |
| ; SCALAR-NEXT: movw %di, 28(%rdx) |
| ; SCALAR-NEXT: movw %cx, 26(%rdx) |
| ; SCALAR-NEXT: movw %ax, 24(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 38(%rdx) |
| ; SCALAR-NEXT: movw %di, 36(%rdx) |
| ; SCALAR-NEXT: movw %cx, 34(%rdx) |
| ; SCALAR-NEXT: movw %ax, 32(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 46(%rdx) |
| ; SCALAR-NEXT: movw %di, 44(%rdx) |
| ; SCALAR-NEXT: movw %cx, 42(%rdx) |
| ; SCALAR-NEXT: movw %ax, 40(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 54(%rdx) |
| ; SCALAR-NEXT: movw %di, 52(%rdx) |
| ; SCALAR-NEXT: movw %cx, 50(%rdx) |
| ; SCALAR-NEXT: movw %ax, 48(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 62(%rdx) |
| ; SCALAR-NEXT: movw %di, 60(%rdx) |
| ; SCALAR-NEXT: movw %cx, 58(%rdx) |
| ; SCALAR-NEXT: movw %ax, 56(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v4i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v4i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec512_v4i16: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec512_v4i16: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 |
| ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> |
| store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 |
| store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 |
| store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 |
| store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 |
| store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4 |
| store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5 |
| store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| %out.subvec6.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 6 |
| store <4 x i16> %in.subvec, ptr %out.subvec6.ptr, align 16 |
| %out.subvec7.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 7 |
| store <4 x i16> %in.subvec, ptr %out.subvec7.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec512_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v4i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movaps (%rdi), %xmm0 |
| ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| ; SCALAR-NEXT: movaps %xmm0, (%rsi) |
| ; SCALAR-NEXT: movaps %xmm0, (%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 16(%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 32(%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v4i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec512_v4i32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> |
| store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 |
| store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 |
| store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2 |
| store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| %out.subvec3.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 3 |
| store <4 x i32> %in.subvec, ptr %out.subvec3.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec512_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v4f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movaps (%rdi), %xmm0 |
| ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| ; SCALAR-NEXT: movaps %xmm0, (%rsi) |
| ; SCALAR-NEXT: movaps %xmm0, (%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 16(%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 32(%rdx) |
| ; SCALAR-NEXT: movaps %xmm0, 48(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v4f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec512_v4f32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> |
| %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> |
| store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 |
| store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 |
| store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2 |
| store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| %out.subvec3.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 3 |
| store <4 x float> %in.subvec, ptr %out.subvec3.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec512_v4i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v4i64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq 24(%rdi), %rax |
| ; SCALAR-NEXT: movq 16(%rdi), %rcx |
| ; SCALAR-NEXT: movq (%rdi), %r8 |
| ; SCALAR-NEXT: movq 8(%rdi), %rdi |
| ; SCALAR-NEXT: notq %r8 |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, 24(%rsi) |
| ; SCALAR-NEXT: movq %rcx, 16(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 8(%rsi) |
| ; SCALAR-NEXT: movq %r8, (%rsi) |
| ; SCALAR-NEXT: movq %rax, 24(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 8(%rdx) |
| ; SCALAR-NEXT: movq %r8, (%rdx) |
| ; SCALAR-NEXT: movq %rax, 56(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 48(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 40(%rdx) |
| ; SCALAR-NEXT: movq %r8, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v4i64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v4i64: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rsi) |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec512_v4i64: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1> |
| store <4 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 0 |
| store <4 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 1 |
| store <4 x i64> %in.subvec, ptr %out.subvec1.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec512_v4f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v4f64: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: movq 24(%rdi), %rax |
| ; SCALAR-NEXT: movq 16(%rdi), %rcx |
| ; SCALAR-NEXT: movq (%rdi), %r8 |
| ; SCALAR-NEXT: movq 8(%rdi), %rdi |
| ; SCALAR-NEXT: notq %r8 |
| ; SCALAR-NEXT: notq %rdi |
| ; SCALAR-NEXT: notq %rcx |
| ; SCALAR-NEXT: notq %rax |
| ; SCALAR-NEXT: movq %rax, 24(%rsi) |
| ; SCALAR-NEXT: movq %rcx, 16(%rsi) |
| ; SCALAR-NEXT: movq %rdi, 8(%rsi) |
| ; SCALAR-NEXT: movq %r8, (%rsi) |
| ; SCALAR-NEXT: movq %rax, 24(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 16(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 8(%rdx) |
| ; SCALAR-NEXT: movq %r8, (%rdx) |
| ; SCALAR-NEXT: movq %rax, 56(%rdx) |
| ; SCALAR-NEXT: movq %rcx, 48(%rdx) |
| ; SCALAR-NEXT: movq %rdi, 40(%rdx) |
| ; SCALAR-NEXT: movq %r8, 32(%rdx) |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v4f64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v4f64: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rsi) |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec512_v4f64: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1> |
| %in.subvec = bitcast <4 x i64> %in.subvec.int to <4 x double> |
| store <4 x double> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 0 |
| store <4 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 1 |
| store <4 x double> %in.subvec, ptr %out.subvec1.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v8i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 7(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 6(%rdi), %r11d |
| ; SCALAR-NEXT: movzbl 5(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 4(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: movb %bl, 7(%rsi) |
| ; SCALAR-NEXT: movb %r11b, 6(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 5(%rsi) |
| ; SCALAR-NEXT: movb %r9b, 4(%rsi) |
| ; SCALAR-NEXT: movb %r8b, 3(%rsi) |
| ; SCALAR-NEXT: movb %cl, 2(%rsi) |
| ; SCALAR-NEXT: movb %dil, 1(%rsi) |
| ; SCALAR-NEXT: movb %al, (%rsi) |
| ; SCALAR-NEXT: movb %bl, 7(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 6(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 5(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 4(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 3(%rdx) |
| ; SCALAR-NEXT: movb %cl, 2(%rdx) |
| ; SCALAR-NEXT: movb %dil, 1(%rdx) |
| ; SCALAR-NEXT: movb %al, (%rdx) |
| ; SCALAR-NEXT: movb %bl, 15(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 14(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 13(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 12(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 11(%rdx) |
| ; SCALAR-NEXT: movb %cl, 10(%rdx) |
| ; SCALAR-NEXT: movb %dil, 9(%rdx) |
| ; SCALAR-NEXT: movb %al, 8(%rdx) |
| ; SCALAR-NEXT: movb %bl, 23(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 22(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 21(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 20(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 19(%rdx) |
| ; SCALAR-NEXT: movb %cl, 18(%rdx) |
| ; SCALAR-NEXT: movb %dil, 17(%rdx) |
| ; SCALAR-NEXT: movb %al, 16(%rdx) |
| ; SCALAR-NEXT: movb %bl, 31(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 30(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 29(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 28(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 27(%rdx) |
| ; SCALAR-NEXT: movb %cl, 26(%rdx) |
| ; SCALAR-NEXT: movb %dil, 25(%rdx) |
| ; SCALAR-NEXT: movb %al, 24(%rdx) |
| ; SCALAR-NEXT: movb %bl, 39(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 38(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 37(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 36(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 35(%rdx) |
| ; SCALAR-NEXT: movb %cl, 34(%rdx) |
| ; SCALAR-NEXT: movb %dil, 33(%rdx) |
| ; SCALAR-NEXT: movb %al, 32(%rdx) |
| ; SCALAR-NEXT: movb %bl, 47(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 46(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 45(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 44(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 43(%rdx) |
| ; SCALAR-NEXT: movb %cl, 42(%rdx) |
| ; SCALAR-NEXT: movb %dil, 41(%rdx) |
| ; SCALAR-NEXT: movb %al, 40(%rdx) |
| ; SCALAR-NEXT: movb %bl, 55(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 54(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 53(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 52(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 51(%rdx) |
| ; SCALAR-NEXT: movb %cl, 50(%rdx) |
| ; SCALAR-NEXT: movb %dil, 49(%rdx) |
| ; SCALAR-NEXT: movb %al, 48(%rdx) |
| ; SCALAR-NEXT: movb %bl, 63(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 62(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 61(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 60(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 59(%rdx) |
| ; SCALAR-NEXT: movb %cl, 58(%rdx) |
| ; SCALAR-NEXT: movb %dil, 57(%rdx) |
| ; SCALAR-NEXT: movb %al, 56(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v8i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero |
| ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: movq %xmm1, (%rsi) |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v8i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] |
| ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-ONLY-LABEL: vec512_v8i8: |
| ; AVX2-ONLY: # %bb.0: |
| ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-ONLY-NEXT: vzeroupper |
| ; AVX2-ONLY-NEXT: retq |
| ; |
| ; AVX512-LABEL: vec512_v8i8: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovq %xmm0, (%rsi) |
| ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 |
| ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 |
| store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 |
| store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 |
| %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 |
| store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 |
| %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 |
| store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 |
| %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4 |
| store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32 |
| %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5 |
| store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8 |
| %out.subvec6.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 6 |
| store <8 x i8> %in.subvec, ptr %out.subvec6.ptr, align 16 |
| %out.subvec7.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 7 |
| store <8 x i8> %in.subvec, ptr %out.subvec7.ptr, align 8 |
| ret void |
| } |
| |
| define void @vec512_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v8i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzwl 14(%rdi), %ebx |
| ; SCALAR-NEXT: movl 12(%rdi), %r11d |
| ; SCALAR-NEXT: movzwl 10(%rdi), %r10d |
| ; SCALAR-NEXT: movl 8(%rdi), %r9d |
| ; SCALAR-NEXT: movzwl 6(%rdi), %r8d |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: notl %r9d |
| ; SCALAR-NEXT: notl %r10d |
| ; SCALAR-NEXT: notl %r11d |
| ; SCALAR-NEXT: notl %ebx |
| ; SCALAR-NEXT: movw %bx, 14(%rsi) |
| ; SCALAR-NEXT: movw %r11w, 12(%rsi) |
| ; SCALAR-NEXT: movw %r10w, 10(%rsi) |
| ; SCALAR-NEXT: movw %r9w, 8(%rsi) |
| ; SCALAR-NEXT: movw %r8w, 6(%rsi) |
| ; SCALAR-NEXT: movw %di, 4(%rsi) |
| ; SCALAR-NEXT: movw %cx, 2(%rsi) |
| ; SCALAR-NEXT: movw %ax, (%rsi) |
| ; SCALAR-NEXT: movw %bx, 14(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 12(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 10(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 8(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 6(%rdx) |
| ; SCALAR-NEXT: movw %di, 4(%rdx) |
| ; SCALAR-NEXT: movw %cx, 2(%rdx) |
| ; SCALAR-NEXT: movw %ax, (%rdx) |
| ; SCALAR-NEXT: movw %bx, 30(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 28(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 26(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 24(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 22(%rdx) |
| ; SCALAR-NEXT: movw %di, 20(%rdx) |
| ; SCALAR-NEXT: movw %cx, 18(%rdx) |
| ; SCALAR-NEXT: movw %ax, 16(%rdx) |
| ; SCALAR-NEXT: movw %bx, 46(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 44(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 42(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 40(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 38(%rdx) |
| ; SCALAR-NEXT: movw %di, 36(%rdx) |
| ; SCALAR-NEXT: movw %cx, 34(%rdx) |
| ; SCALAR-NEXT: movw %ax, 32(%rdx) |
| ; SCALAR-NEXT: movw %bx, 62(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 60(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 58(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 56(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 54(%rdx) |
| ; SCALAR-NEXT: movw %di, 52(%rdx) |
| ; SCALAR-NEXT: movw %cx, 50(%rdx) |
| ; SCALAR-NEXT: movw %ax, 48(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v8i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec512_v8i16: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> |
| store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 |
| store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 |
| store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2 |
| store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| %out.subvec3.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 3 |
| store <8 x i16> %in.subvec, ptr %out.subvec3.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec512_v8i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v8i32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movl 28(%rdi), %ebx |
| ; SCALAR-NEXT: movl 24(%rdi), %r11d |
| ; SCALAR-NEXT: movl 20(%rdi), %r10d |
| ; SCALAR-NEXT: movl 16(%rdi), %r9d |
| ; SCALAR-NEXT: movl 12(%rdi), %r8d |
| ; SCALAR-NEXT: movl 8(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: notl %r9d |
| ; SCALAR-NEXT: notl %r10d |
| ; SCALAR-NEXT: notl %r11d |
| ; SCALAR-NEXT: notl %ebx |
| ; SCALAR-NEXT: movl %ebx, 28(%rsi) |
| ; SCALAR-NEXT: movl %r11d, 24(%rsi) |
| ; SCALAR-NEXT: movl %r10d, 20(%rsi) |
| ; SCALAR-NEXT: movl %r9d, 16(%rsi) |
| ; SCALAR-NEXT: movl %r8d, 12(%rsi) |
| ; SCALAR-NEXT: movl %ecx, 8(%rsi) |
| ; SCALAR-NEXT: movl %edi, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ebx, 28(%rdx) |
| ; SCALAR-NEXT: movl %r11d, 24(%rdx) |
| ; SCALAR-NEXT: movl %r10d, 20(%rdx) |
| ; SCALAR-NEXT: movl %r9d, 16(%rdx) |
| ; SCALAR-NEXT: movl %r8d, 12(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 8(%rdx) |
| ; SCALAR-NEXT: movl %edi, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ebx, 60(%rdx) |
| ; SCALAR-NEXT: movl %r11d, 56(%rdx) |
| ; SCALAR-NEXT: movl %r10d, 52(%rdx) |
| ; SCALAR-NEXT: movl %r9d, 48(%rdx) |
| ; SCALAR-NEXT: movl %r8d, 44(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 40(%rdx) |
| ; SCALAR-NEXT: movl %edi, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v8i32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v8i32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rsi) |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec512_v8i32: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> |
| store <8 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 0 |
| store <8 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 1 |
| store <8 x i32> %in.subvec, ptr %out.subvec1.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec512_v8f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v8f32: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movl 28(%rdi), %ebx |
| ; SCALAR-NEXT: movl 24(%rdi), %r11d |
| ; SCALAR-NEXT: movl 20(%rdi), %r10d |
| ; SCALAR-NEXT: movl 16(%rdi), %r9d |
| ; SCALAR-NEXT: movl 12(%rdi), %r8d |
| ; SCALAR-NEXT: movl 8(%rdi), %ecx |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: notl %r9d |
| ; SCALAR-NEXT: notl %r10d |
| ; SCALAR-NEXT: notl %r11d |
| ; SCALAR-NEXT: notl %ebx |
| ; SCALAR-NEXT: movl %ebx, 28(%rsi) |
| ; SCALAR-NEXT: movl %r11d, 24(%rsi) |
| ; SCALAR-NEXT: movl %r10d, 20(%rsi) |
| ; SCALAR-NEXT: movl %r9d, 16(%rsi) |
| ; SCALAR-NEXT: movl %r8d, 12(%rsi) |
| ; SCALAR-NEXT: movl %ecx, 8(%rsi) |
| ; SCALAR-NEXT: movl %edi, 4(%rsi) |
| ; SCALAR-NEXT: movl %eax, (%rsi) |
| ; SCALAR-NEXT: movl %ebx, 28(%rdx) |
| ; SCALAR-NEXT: movl %r11d, 24(%rdx) |
| ; SCALAR-NEXT: movl %r10d, 20(%rdx) |
| ; SCALAR-NEXT: movl %r9d, 16(%rdx) |
| ; SCALAR-NEXT: movl %r8d, 12(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 8(%rdx) |
| ; SCALAR-NEXT: movl %edi, 4(%rdx) |
| ; SCALAR-NEXT: movl %eax, (%rdx) |
| ; SCALAR-NEXT: movl %ebx, 60(%rdx) |
| ; SCALAR-NEXT: movl %r11d, 56(%rdx) |
| ; SCALAR-NEXT: movl %r10d, 52(%rdx) |
| ; SCALAR-NEXT: movl %r9d, 48(%rdx) |
| ; SCALAR-NEXT: movl %r8d, 44(%rdx) |
| ; SCALAR-NEXT: movl %ecx, 40(%rdx) |
| ; SCALAR-NEXT: movl %edi, 36(%rdx) |
| ; SCALAR-NEXT: movl %eax, 32(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v8f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v8f32: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rsi) |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec512_v8f32: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64 |
| %in.subvec.int = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> |
| %in.subvec = bitcast <8 x i32> %in.subvec.int to <8 x float> |
| store <8 x float> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 0 |
| store <8 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 1 |
| store <8 x float> %in.subvec, ptr %out.subvec1.ptr, align 32 |
| ret void |
| } |
| |
| define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v16i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbp |
| ; SCALAR-NEXT: pushq %r15 |
| ; SCALAR-NEXT: pushq %r14 |
| ; SCALAR-NEXT: pushq %r13 |
| ; SCALAR-NEXT: pushq %r12 |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 15(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 14(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 13(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 12(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 11(%rdi), %r13d |
| ; SCALAR-NEXT: movzbl 10(%rdi), %r12d |
| ; SCALAR-NEXT: movzbl 9(%rdi), %r15d |
| ; SCALAR-NEXT: movzbl 8(%rdi), %r14d |
| ; SCALAR-NEXT: movzbl 7(%rdi), %ebp |
| ; SCALAR-NEXT: movzbl 6(%rdi), %r11d |
| ; SCALAR-NEXT: movzbl 5(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 4(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %ecx |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %edi |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movl %ebx, %r9d |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: movl %r11d, %ebx |
| ; SCALAR-NEXT: notb %bpl |
| ; SCALAR-NEXT: notb %r14b |
| ; SCALAR-NEXT: notb %r15b |
| ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r12b |
| ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r13b |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: movb %r11b, 15(%rsi) |
| ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %r8b, 14(%rsi) |
| ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movl %edi, %eax |
| ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %dil, 13(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 12(%rsi) |
| ; SCALAR-NEXT: movb %r13b, 11(%rsi) |
| ; SCALAR-NEXT: movb %r12b, 10(%rsi) |
| ; SCALAR-NEXT: movb %r15b, 9(%rsi) |
| ; SCALAR-NEXT: movb %r14b, 8(%rsi) |
| ; SCALAR-NEXT: movl %r14d, %r12d |
| ; SCALAR-NEXT: movb %bpl, 7(%rsi) |
| ; SCALAR-NEXT: movl %ebp, %r14d |
| ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %bl, 6(%rsi) |
| ; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %r9b, 5(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 4(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %bpl, 3(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 2(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 1(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, (%rsi) |
| ; SCALAR-NEXT: movb %r11b, 15(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 14(%rdx) |
| ; SCALAR-NEXT: movb %al, 13(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 11(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r15b, 10(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 9(%rdx) |
| ; SCALAR-NEXT: movb %r12b, 8(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 7(%rdx) |
| ; SCALAR-NEXT: movb %bl, 6(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 5(%rdx) |
| ; SCALAR-NEXT: movl %r9d, %r11d |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r8b, 4(%rdx) |
| ; SCALAR-NEXT: movb %bpl, 3(%rdx) |
| ; SCALAR-NEXT: movb %dil, 2(%rdx) |
| ; SCALAR-NEXT: movb %cl, 1(%rdx) |
| ; SCALAR-NEXT: movl %ecx, %r14d |
| ; SCALAR-NEXT: movl %r10d, %esi |
| ; SCALAR-NEXT: movb %r10b, (%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 31(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r9b, 30(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 29(%rdx) |
| ; SCALAR-NEXT: movb %al, 28(%rdx) |
| ; SCALAR-NEXT: movl %eax, %r10d |
| ; SCALAR-NEXT: movb %r13b, 27(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 26(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r15b, 25(%rdx) |
| ; SCALAR-NEXT: movl %r12d, %ebp |
| ; SCALAR-NEXT: movb %r12b, 24(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %bl, 23(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 22(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 21(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 20(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r8b, 19(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r8b, 18(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 17(%rdx) |
| ; SCALAR-NEXT: movb %sil, 16(%rdx) |
| ; SCALAR-NEXT: movl %esi, %r11d |
| ; SCALAR-NEXT: movb %cl, 47(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 46(%rdx) |
| ; SCALAR-NEXT: movb %dil, 45(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 44(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 43(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r12b, 42(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 41(%rdx) |
| ; SCALAR-NEXT: movl %ebp, %r14d |
| ; SCALAR-NEXT: movb %bpl, 40(%rdx) |
| ; SCALAR-NEXT: movl %ebx, %ebp |
| ; SCALAR-NEXT: movb %bl, 39(%rdx) |
| ; SCALAR-NEXT: movl %eax, %ebx |
| ; SCALAR-NEXT: movb %al, 38(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 37(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 36(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 35(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 34(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r9b, 33(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 32(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r11b, 63(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r11b, 62(%rdx) |
| ; SCALAR-NEXT: movb %dil, 61(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 60(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 59(%rdx) |
| ; SCALAR-NEXT: movb %r12b, 58(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 57(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 56(%rdx) |
| ; SCALAR-NEXT: movb %bpl, 55(%rdx) |
| ; SCALAR-NEXT: movb %bl, 54(%rdx) |
| ; SCALAR-NEXT: movb %cl, 53(%rdx) |
| ; SCALAR-NEXT: movb %al, 52(%rdx) |
| ; SCALAR-NEXT: movb %sil, 51(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 50(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 49(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 48(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: popq %r12 |
| ; SCALAR-NEXT: popq %r13 |
| ; SCALAR-NEXT: popq %r14 |
| ; SCALAR-NEXT: popq %r15 |
| ; SCALAR-NEXT: popq %rbp |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v16i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX-LABEL: vec512_v16i8: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rsi) |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) |
| ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) |
| ; AVX-NEXT: retq |
| %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 |
| store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 |
| store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 |
| %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2 |
| store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32 |
| %out.subvec3.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 3 |
| store <16 x i8> %in.subvec, ptr %out.subvec3.ptr, align 16 |
| ret void |
| } |
| |
| define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v16i16: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbp |
| ; SCALAR-NEXT: pushq %r15 |
| ; SCALAR-NEXT: pushq %r14 |
| ; SCALAR-NEXT: pushq %r13 |
| ; SCALAR-NEXT: pushq %r12 |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzwl 30(%rdi), %eax |
| ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movl 28(%rdi), %eax |
| ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movzwl 26(%rdi), %eax |
| ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movl 24(%rdi), %r13d |
| ; SCALAR-NEXT: movzwl 22(%rdi), %r12d |
| ; SCALAR-NEXT: movl 20(%rdi), %r15d |
| ; SCALAR-NEXT: movzwl 18(%rdi), %r14d |
| ; SCALAR-NEXT: movl 16(%rdi), %ebx |
| ; SCALAR-NEXT: movzwl 14(%rdi), %r11d |
| ; SCALAR-NEXT: movl 12(%rdi), %r10d |
| ; SCALAR-NEXT: movzwl 10(%rdi), %r9d |
| ; SCALAR-NEXT: movl 8(%rdi), %r8d |
| ; SCALAR-NEXT: movzwl 6(%rdi), %ecx |
| ; SCALAR-NEXT: movzwl 2(%rdi), %ebp |
| ; SCALAR-NEXT: movl (%rdi), %eax |
| ; SCALAR-NEXT: movl 4(%rdi), %edi |
| ; SCALAR-NEXT: notl %eax |
| ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: notl %ebp |
| ; SCALAR-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: notl %ecx |
| ; SCALAR-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: notl %r9d |
| ; SCALAR-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movl %r10d, %edi |
| ; SCALAR-NEXT: notl %edi |
| ; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: notl %r11d |
| ; SCALAR-NEXT: movl %r11d, %r9d |
| ; SCALAR-NEXT: notl %ebx |
| ; SCALAR-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: notl %r14d |
| ; SCALAR-NEXT: notl %r15d |
| ; SCALAR-NEXT: notl %r12d |
| ; SCALAR-NEXT: notl %r13d |
| ; SCALAR-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload |
| ; SCALAR-NEXT: notl %r10d |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload |
| ; SCALAR-NEXT: notl %r11d |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload |
| ; SCALAR-NEXT: notl %r8d |
| ; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movw %r8w, 30(%rsi) |
| ; SCALAR-NEXT: movw %r11w, 28(%rsi) |
| ; SCALAR-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movw %r10w, 26(%rsi) |
| ; SCALAR-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SCALAR-NEXT: movw %r13w, 24(%rsi) |
| ; SCALAR-NEXT: movw %r12w, 22(%rsi) |
| ; SCALAR-NEXT: movw %r15w, 20(%rsi) |
| ; SCALAR-NEXT: movw %r14w, 18(%rsi) |
| ; SCALAR-NEXT: movw %bx, 16(%rsi) |
| ; SCALAR-NEXT: movw %r9w, 14(%rsi) |
| ; SCALAR-NEXT: movw %di, 12(%rsi) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Reload |
| ; SCALAR-NEXT: movw %bp, 10(%rsi) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload |
| ; SCALAR-NEXT: movw %di, 8(%rsi) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload |
| ; SCALAR-NEXT: movw %cx, 6(%rsi) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload |
| ; SCALAR-NEXT: movw %r8w, 4(%rsi) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload |
| ; SCALAR-NEXT: movw %ax, 2(%rsi) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload |
| ; SCALAR-NEXT: movw %bx, (%rsi) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload |
| ; SCALAR-NEXT: movw %r13w, 30(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 28(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 26(%rdx) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload |
| ; SCALAR-NEXT: movw %si, 24(%rdx) |
| ; SCALAR-NEXT: movw %r12w, 22(%rdx) |
| ; SCALAR-NEXT: movw %r15w, 20(%rdx) |
| ; SCALAR-NEXT: movw %r14w, 18(%rdx) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload |
| ; SCALAR-NEXT: movw %r11w, 16(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 14(%rdx) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload |
| ; SCALAR-NEXT: movw %r10w, 12(%rdx) |
| ; SCALAR-NEXT: movw %bp, 10(%rdx) |
| ; SCALAR-NEXT: movw %di, 8(%rdx) |
| ; SCALAR-NEXT: movw %cx, 6(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 4(%rdx) |
| ; SCALAR-NEXT: movw %ax, 2(%rdx) |
| ; SCALAR-NEXT: movl %ebx, %esi |
| ; SCALAR-NEXT: movw %si, (%rdx) |
| ; SCALAR-NEXT: movw %r13w, 62(%rdx) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload |
| ; SCALAR-NEXT: movw %bx, 60(%rdx) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload |
| ; SCALAR-NEXT: movw %bx, 58(%rdx) |
| ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload |
| ; SCALAR-NEXT: movw %bx, 56(%rdx) |
| ; SCALAR-NEXT: movw %r12w, 54(%rdx) |
| ; SCALAR-NEXT: movw %r15w, 52(%rdx) |
| ; SCALAR-NEXT: movw %r14w, 50(%rdx) |
| ; SCALAR-NEXT: movw %r11w, 48(%rdx) |
| ; SCALAR-NEXT: movw %r9w, 46(%rdx) |
| ; SCALAR-NEXT: movw %r10w, 44(%rdx) |
| ; SCALAR-NEXT: movw %bp, 42(%rdx) |
| ; SCALAR-NEXT: movw %di, 40(%rdx) |
| ; SCALAR-NEXT: movw %cx, 38(%rdx) |
| ; SCALAR-NEXT: movw %r8w, 36(%rdx) |
| ; SCALAR-NEXT: movw %ax, 34(%rdx) |
| ; SCALAR-NEXT: movw %si, 32(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: popq %r12 |
| ; SCALAR-NEXT: popq %r13 |
| ; SCALAR-NEXT: popq %r14 |
| ; SCALAR-NEXT: popq %r15 |
| ; SCALAR-NEXT: popq %rbp |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v16i16: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v16i16: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rsi) |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec512_v16i16: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <16 x i16>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <16 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> |
| store <16 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 0 |
| store <16 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 1 |
| store <16 x i16> %in.subvec, ptr %out.subvec1.ptr, align 32 |
| ret void |
| } |
| |
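| ; Load a 256-bit <32 x i8> subvector, bitwise-NOT it, store the result to %out.subvec.ptr, |
| ; then store two copies of it to fill the 512-bit destination at %out.vec.ptr. |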
| define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { |
| ; SCALAR-LABEL: vec512_v32i8: |
| ; SCALAR: # %bb.0: |
| ; SCALAR-NEXT: pushq %rbp |
| ; SCALAR-NEXT: pushq %r15 |
| ; SCALAR-NEXT: pushq %r14 |
| ; SCALAR-NEXT: pushq %r13 |
| ; SCALAR-NEXT: pushq %r12 |
| ; SCALAR-NEXT: pushq %rbx |
| ; SCALAR-NEXT: movzbl 16(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 15(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 14(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 13(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 12(%rdi), %r13d |
| ; SCALAR-NEXT: movzbl 11(%rdi), %eax |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 10(%rdi), %r12d |
| ; SCALAR-NEXT: movzbl 9(%rdi), %r15d |
| ; SCALAR-NEXT: movzbl 8(%rdi), %r14d |
| ; SCALAR-NEXT: movzbl 7(%rdi), %ebp |
| ; SCALAR-NEXT: movzbl 6(%rdi), %ebx |
| ; SCALAR-NEXT: movzbl 5(%rdi), %r11d |
| ; SCALAR-NEXT: movzbl 4(%rdi), %r10d |
| ; SCALAR-NEXT: movzbl 3(%rdi), %r9d |
| ; SCALAR-NEXT: movzbl 2(%rdi), %r8d |
| ; SCALAR-NEXT: movzbl (%rdi), %eax |
| ; SCALAR-NEXT: movzbl 1(%rdi), %ecx |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %bpl |
| ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r14b |
| ; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r15b |
| ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r12b |
| ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r11b |
| ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb %r13b |
| ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill |
| ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: notb %r8b |
| ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill |
| ; SCALAR-NEXT: movzbl 17(%rdi), %eax |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 18(%rdi), %eax |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 19(%rdi), %eax |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 20(%rdi), %eax |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 21(%rdi), %ebp |
| ; SCALAR-NEXT: notb %bpl |
| ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 22(%rdi), %ebx |
| ; SCALAR-NEXT: notb %bl |
| ; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 23(%rdi), %r10d |
| ; SCALAR-NEXT: notb %r10b |
| ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 24(%rdi), %r9d |
| ; SCALAR-NEXT: notb %r9b |
| ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 25(%rdi), %ecx |
| ; SCALAR-NEXT: notb %cl |
| ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 26(%rdi), %r14d |
| ; SCALAR-NEXT: notb %r14b |
| ; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 27(%rdi), %r15d |
| ; SCALAR-NEXT: notb %r15b |
| ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 28(%rdi), %r12d |
| ; SCALAR-NEXT: notb %r12b |
| ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 29(%rdi), %r13d |
| ; SCALAR-NEXT: notb %r13b |
| ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 30(%rdi), %eax |
| ; SCALAR-NEXT: notb %al |
| ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl 31(%rdi), %edi |
| ; SCALAR-NEXT: notb %dil |
| ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movb %dil, 31(%rsi) |
| ; SCALAR-NEXT: movb %al, 30(%rsi) |
| ; SCALAR-NEXT: movb %r13b, 29(%rsi) |
| ; SCALAR-NEXT: movb %r12b, 28(%rsi) |
| ; SCALAR-NEXT: movb %r15b, 27(%rsi) |
| ; SCALAR-NEXT: movb %r14b, 26(%rsi) |
| ; SCALAR-NEXT: movb %cl, 25(%rsi) |
| ; SCALAR-NEXT: movb %r9b, 24(%rsi) |
| ; SCALAR-NEXT: movb %r10b, 23(%rsi) |
| ; SCALAR-NEXT: movb %bl, 22(%rsi) |
| ; SCALAR-NEXT: movb %bpl, 21(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %bpl, 20(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 19(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 18(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 17(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 16(%rsi) |
| ; SCALAR-NEXT: movb %r8b, 15(%rsi) |
| ; SCALAR-NEXT: movl %r8d, %r14d |
| ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %bl, 14(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 13(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 12(%rsi) |
| ; SCALAR-NEXT: movb %r11b, 11(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 10(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 9(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, 8(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r11b, 7(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r13b, 6(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r10b, 5(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r12b, 4(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r9b, 3(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r15b, 2(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r8b, 1(%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %dil, (%rsi) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 31(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 30(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 29(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 28(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 27(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 26(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 25(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 24(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 23(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 22(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 21(%rdx) |
| ; SCALAR-NEXT: movb %bpl, 20(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 19(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 18(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 17(%rdx) |
| ; SCALAR-NEXT: movb %cl, 16(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 15(%rdx) |
| ; SCALAR-NEXT: movb %bl, 14(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %cl, 13(%rdx) |
| ; SCALAR-NEXT: movb %al, 12(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %sil, 11(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %bl, 10(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %r14b, 9(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %bpl, 8(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 7(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 6(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 5(%rdx) |
| ; SCALAR-NEXT: movb %r12b, 4(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 3(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 2(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 1(%rdx) |
| ; SCALAR-NEXT: movb %dil, (%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 63(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 62(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 61(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 60(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 59(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 58(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 57(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 56(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 55(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 54(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 53(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 52(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 51(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 50(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 49(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 48(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 47(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 46(%rdx) |
| ; SCALAR-NEXT: movb %cl, 45(%rdx) |
| ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload |
| ; SCALAR-NEXT: movb %al, 44(%rdx) |
| ; SCALAR-NEXT: movb %sil, 43(%rdx) |
| ; SCALAR-NEXT: movb %bl, 42(%rdx) |
| ; SCALAR-NEXT: movb %r14b, 41(%rdx) |
| ; SCALAR-NEXT: movb %bpl, 40(%rdx) |
| ; SCALAR-NEXT: movb %r11b, 39(%rdx) |
| ; SCALAR-NEXT: movb %r13b, 38(%rdx) |
| ; SCALAR-NEXT: movb %r10b, 37(%rdx) |
| ; SCALAR-NEXT: movb %r12b, 36(%rdx) |
| ; SCALAR-NEXT: movb %r9b, 35(%rdx) |
| ; SCALAR-NEXT: movb %r15b, 34(%rdx) |
| ; SCALAR-NEXT: movb %r8b, 33(%rdx) |
| ; SCALAR-NEXT: movb %dil, 32(%rdx) |
| ; SCALAR-NEXT: popq %rbx |
| ; SCALAR-NEXT: popq %r12 |
| ; SCALAR-NEXT: popq %r13 |
| ; SCALAR-NEXT: popq %r14 |
| ; SCALAR-NEXT: popq %r15 |
| ; SCALAR-NEXT: popq %rbp |
| ; SCALAR-NEXT: retq |
| ; |
| ; SSE2-LABEL: vec512_v32i8: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 |
| ; SSE2-NEXT: pxor %xmm0, %xmm1 |
| ; SSE2-NEXT: pxor (%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa %xmm0, (%rsi) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rsi) |
| ; SSE2-NEXT: movdqa %xmm0, (%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) |
| ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) |
| ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) |
| ; SSE2-NEXT: retq |
| ; |
| ; AVX1-LABEL: vec512_v32i8: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 |
| ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 |
| ; AVX1-NEXT: vmovaps %ymm0, (%rsi) |
| ; AVX1-NEXT: vmovaps %ymm0, (%rdx) |
| ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) |
| ; AVX1-NEXT: vzeroupper |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: vec512_v32i8: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) |
| ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) |
| ; AVX2-NEXT: vzeroupper |
| ; AVX2-NEXT: retq |
| %in.subvec.not = load <32 x i8>, ptr %in.subvec.ptr, align 64 |
| %in.subvec = xor <32 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| store <32 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 |
| %out.subvec0.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 0 |
| store <32 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 |
| %out.subvec1.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 1 |
| store <32 x i8> %in.subvec, ptr %out.subvec1.ptr, align 32 |
| ret void |
| } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; SSSE3: {{.*}} |