| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL |
| |
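; Check (per the function names) that adjacent v2f32 fsubs over consecutive
; memory can be widened into wider vector subtracts/stores where profitable.

; Two adjacent v2f32 subtractions: SSE keeps two 8-byte movlps stores, while
; the AVX targets concatenate the two results with vmovlhps into a single
; unaligned 16-byte store.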
| define void @widen_fsub_v2f32_v4f32(ptr %a0, ptr %b0, ptr %c0) { |
| ; SSE-LABEL: widen_fsub_v2f32_v4f32: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero |
| ; SSE-NEXT: subps %xmm2, %xmm0 |
| ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero |
| ; SSE-NEXT: subps %xmm2, %xmm1 |
| ; SSE-NEXT: movlps %xmm0, (%rdx) |
| ; SSE-NEXT: movlps %xmm1, 8(%rdx) |
| ; SSE-NEXT: retq |
| ; |
| ; AVX-LABEL: widen_fsub_v2f32_v4f32: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX-NEXT: vsubps %xmm2, %xmm0, %xmm0 |
| ; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX-NEXT: vsubps %xmm2, %xmm1, %xmm1 |
| ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] |
| ; AVX-NEXT: vmovups %xmm0, (%rdx) |
| ; AVX-NEXT: retq |
| %a2 = getelementptr inbounds i8, ptr %a0, i64 8 |
| %b2 = getelementptr inbounds i8, ptr %b0, i64 8 |
| %c2 = getelementptr inbounds i8, ptr %c0, i64 8 |
| %va0 = load <2 x float>, ptr %a0, align 4 |
| %vb0 = load <2 x float>, ptr %b0, align 4 |
| %va2 = load <2 x float>, ptr %a2, align 4 |
| %vb2 = load <2 x float>, ptr %b2, align 4 |
| %vc0 = fsub <2 x float> %va0, %vb0 |
| %vc2 = fsub <2 x float> %va2, %vb2 |
| store <2 x float> %vc0, ptr %c0, align 4 |
| store <2 x float> %vc2, ptr %c2, align 4 |
| ret void |
| } |
| |
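; Four adjacent v2f32 subtractions: the AVX targets gather the four xmm
; results into a ymm register for a single 32-byte store. AVX512VL currently
; needs an extra vperm2f128 compared to the AVX1/AVX2 shuffle sequence.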
| define void @widen_fsub_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) { |
| ; SSE-LABEL: widen_fsub_v2f32_v8f32: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm0 |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm1 |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm2 |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm3 |
| ; SSE-NEXT: movlps %xmm0, (%rdx) |
| ; SSE-NEXT: movlps %xmm1, 8(%rdx) |
| ; SSE-NEXT: movlps %xmm2, 16(%rdx) |
| ; SSE-NEXT: movlps %xmm3, 24(%rdx) |
| ; SSE-NEXT: retq |
| ; |
| ; AVX1OR2-LABEL: widen_fsub_v2f32_v8f32: |
| ; AVX1OR2: # %bb.0: |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm0, %xmm0 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm1, %xmm1 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm2, %xmm2 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm3, %xmm3 |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] |
| ; AVX1OR2-NEXT: vmovups %ymm0, (%rdx) |
| ; AVX1OR2-NEXT: vzeroupper |
| ; AVX1OR2-NEXT: retq |
| ; |
| ; AVX512F-LABEL: widen_fsub_v2f32_v8f32: |
| ; AVX512F: # %bb.0: |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm0, %xmm0 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm1, %xmm1 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm2, %xmm2 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm3, %xmm3 |
| ; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 |
| ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] |
| ; AVX512F-NEXT: vmovups %ymm0, (%rdx) |
| ; AVX512F-NEXT: vzeroupper |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: widen_fsub_v2f32_v8f32: |
| ; AVX512VL: # %bb.0: |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm0, %xmm0 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm1, %xmm1 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm2, %xmm2 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm3, %xmm3 |
| ; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 |
| ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 |
| ; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] |
| ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] |
| ; AVX512VL-NEXT: vmovups %ymm0, (%rdx) |
| ; AVX512VL-NEXT: vzeroupper |
| ; AVX512VL-NEXT: retq |
| %a2 = getelementptr inbounds i8, ptr %a0, i64 8 |
| %b2 = getelementptr inbounds i8, ptr %b0, i64 8 |
| %c2 = getelementptr inbounds i8, ptr %c0, i64 8 |
| %a4 = getelementptr inbounds i8, ptr %a0, i64 16 |
| %b4 = getelementptr inbounds i8, ptr %b0, i64 16 |
| %c4 = getelementptr inbounds i8, ptr %c0, i64 16 |
| %a6 = getelementptr inbounds i8, ptr %a0, i64 24 |
| %b6 = getelementptr inbounds i8, ptr %b0, i64 24 |
| %c6 = getelementptr inbounds i8, ptr %c0, i64 24 |
| %va0 = load <2 x float>, ptr %a0, align 4 |
| %vb0 = load <2 x float>, ptr %b0, align 4 |
| %va2 = load <2 x float>, ptr %a2, align 4 |
| %vb2 = load <2 x float>, ptr %b2, align 4 |
| %va4 = load <2 x float>, ptr %a4, align 4 |
| %vb4 = load <2 x float>, ptr %b4, align 4 |
| %va6 = load <2 x float>, ptr %a6, align 4 |
| %vb6 = load <2 x float>, ptr %b6, align 4 |
| %vc0 = fsub <2 x float> %va0, %vb0 |
| %vc2 = fsub <2 x float> %va2, %vb2 |
| %vc4 = fsub <2 x float> %va4, %vb4 |
| %vc6 = fsub <2 x float> %va6, %vb6 |
| store <2 x float> %vc0, ptr %c0, align 4 |
| store <2 x float> %vc2, ptr %c2, align 4 |
| store <2 x float> %vc4, ptr %c4, align 4 |
| store <2 x float> %vc6, ptr %c6, align 4 |
| ret void |
| } |
| |
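; Eight adjacent v2f32 subtractions: AVX1/AVX2 emit two 32-byte ymm stores,
; while the AVX512 targets use a repeated [0,2,8,10] vpermt2pd/vpermi2pd
; index pattern to collect all eight results into one 64-byte zmm store.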
| define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { |
| ; SSE-LABEL: widen_fsub_v2f32_v16f32: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm0 |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm1 |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm2 |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: subps %xmm4, %xmm3 |
| ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero |
| ; SSE-NEXT: subps %xmm5, %xmm4 |
| ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero |
| ; SSE-NEXT: subps %xmm6, %xmm5 |
| ; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero |
| ; SSE-NEXT: subps %xmm7, %xmm6 |
| ; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero |
| ; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero |
| ; SSE-NEXT: subps %xmm8, %xmm7 |
| ; SSE-NEXT: movlps %xmm0, (%rdx) |
| ; SSE-NEXT: movlps %xmm1, 8(%rdx) |
| ; SSE-NEXT: movlps %xmm2, 16(%rdx) |
| ; SSE-NEXT: movlps %xmm3, 24(%rdx) |
| ; SSE-NEXT: movlps %xmm4, 32(%rdx) |
| ; SSE-NEXT: movlps %xmm5, 40(%rdx) |
| ; SSE-NEXT: movlps %xmm6, 48(%rdx) |
| ; SSE-NEXT: movlps %xmm7, 56(%rdx) |
| ; SSE-NEXT: retq |
| ; |
| ; AVX1OR2-LABEL: widen_fsub_v2f32_v16f32: |
| ; AVX1OR2: # %bb.0: |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm0, %xmm0 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm1, %xmm1 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm2, %xmm2 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm4, %xmm3, %xmm3 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm5, %xmm4, %xmm4 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm6, %xmm5, %xmm5 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm7, %xmm6, %xmm6 |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero |
| ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero |
| ; AVX1OR2-NEXT: vsubps %xmm8, %xmm7, %xmm7 |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] |
| ; AVX1OR2-NEXT: vmovups %ymm0, (%rdx) |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm0 |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm1 |
| ; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] |
| ; AVX1OR2-NEXT: vmovups %ymm0, 32(%rdx) |
| ; AVX1OR2-NEXT: vzeroupper |
| ; AVX1OR2-NEXT: retq |
| ; |
| ; AVX512F-LABEL: widen_fsub_v2f32_v16f32: |
| ; AVX512F: # %bb.0: |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm0, %xmm0 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm1, %xmm1 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm2, %xmm2 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm4, %xmm3, %xmm3 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm5, %xmm4, %xmm4 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm6, %xmm5, %xmm5 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm7, %xmm6, %xmm6 |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero |
| ; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero |
| ; AVX512F-NEXT: vsubps %xmm8, %xmm7, %xmm7 |
| ; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6 |
| ; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4 |
| ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10] |
| ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] |
| ; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4 |
| ; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 |
| ; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 |
| ; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0 |
| ; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0 |
| ; AVX512F-NEXT: vmovupd %zmm0, (%rdx) |
| ; AVX512F-NEXT: vzeroupper |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: widen_fsub_v2f32_v16f32: |
| ; AVX512VL: # %bb.0: |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm0, %xmm0 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm1, %xmm1 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm2, %xmm2 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm4, %xmm3, %xmm3 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm5, %xmm4, %xmm4 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm6, %xmm5, %xmm5 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm7, %xmm6, %xmm6 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero |
| ; AVX512VL-NEXT: vsubps %xmm8, %xmm7, %xmm7 |
| ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6 |
| ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4 |
| ; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10] |
| ; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] |
| ; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5 |
| ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 |
| ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 |
| ; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] |
| ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 |
| ; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0 |
| ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) |
| ; AVX512VL-NEXT: vzeroupper |
| ; AVX512VL-NEXT: retq |
| %a2 = getelementptr inbounds i8, ptr %a0, i64 8 |
| %b2 = getelementptr inbounds i8, ptr %b0, i64 8 |
| %c2 = getelementptr inbounds i8, ptr %c0, i64 8 |
| %a4 = getelementptr inbounds i8, ptr %a0, i64 16 |
| %b4 = getelementptr inbounds i8, ptr %b0, i64 16 |
| %c4 = getelementptr inbounds i8, ptr %c0, i64 16 |
| %a6 = getelementptr inbounds i8, ptr %a0, i64 24 |
| %b6 = getelementptr inbounds i8, ptr %b0, i64 24 |
| %c6 = getelementptr inbounds i8, ptr %c0, i64 24 |
| %a8 = getelementptr inbounds i8, ptr %a0, i64 32 |
| %b8 = getelementptr inbounds i8, ptr %b0, i64 32 |
| %c8 = getelementptr inbounds i8, ptr %c0, i64 32 |
| %a10 = getelementptr inbounds i8, ptr %a0, i64 40 |
| %b10 = getelementptr inbounds i8, ptr %b0, i64 40 |
| %c10 = getelementptr inbounds i8, ptr %c0, i64 40 |
| %a12 = getelementptr inbounds i8, ptr %a0, i64 48 |
| %b12 = getelementptr inbounds i8, ptr %b0, i64 48 |
| %c12 = getelementptr inbounds i8, ptr %c0, i64 48 |
| %a14 = getelementptr inbounds i8, ptr %a0, i64 56 |
| %b14 = getelementptr inbounds i8, ptr %b0, i64 56 |
| %c14 = getelementptr inbounds i8, ptr %c0, i64 56 |
| %va0 = load <2 x float>, ptr %a0, align 4 |
| %vb0 = load <2 x float>, ptr %b0, align 4 |
| %va2 = load <2 x float>, ptr %a2, align 4 |
| %vb2 = load <2 x float>, ptr %b2, align 4 |
| %va4 = load <2 x float>, ptr %a4, align 4 |
| %vb4 = load <2 x float>, ptr %b4, align 4 |
| %va6 = load <2 x float>, ptr %a6, align 4 |
| %vb6 = load <2 x float>, ptr %b6, align 4 |
| %va8 = load <2 x float>, ptr %a8, align 4 |
| %vb8 = load <2 x float>, ptr %b8, align 4 |
| %va10 = load <2 x float>, ptr %a10, align 4 |
| %vb10 = load <2 x float>, ptr %b10, align 4 |
| %va12 = load <2 x float>, ptr %a12, align 4 |
| %vb12 = load <2 x float>, ptr %b12, align 4 |
| %va14 = load <2 x float>, ptr %a14, align 4 |
| %vb14 = load <2 x float>, ptr %b14, align 4 |
| %vc0 = fsub <2 x float> %va0, %vb0 |
| %vc2 = fsub <2 x float> %va2, %vb2 |
| %vc4 = fsub <2 x float> %va4, %vb4 |
| %vc6 = fsub <2 x float> %va6, %vb6 |
| %vc8 = fsub <2 x float> %va8, %vb8 |
| %vc10 = fsub <2 x float> %va10, %vb10 |
| %vc12 = fsub <2 x float> %va12, %vb12 |
| %vc14 = fsub <2 x float> %va14, %vb14 |
| store <2 x float> %vc0, ptr %c0, align 4 |
| store <2 x float> %vc2, ptr %c2, align 4 |
| store <2 x float> %vc4, ptr %c4, align 4 |
| store <2 x float> %vc6, ptr %c6, align 4 |
| store <2 x float> %vc8, ptr %c8, align 4 |
| store <2 x float> %vc10, ptr %c10, align 4 |
| store <2 x float> %vc12, ptr %c12, align 4 |
| store <2 x float> %vc14, ptr %c14, align 4 |
| ret void |
| } |
| |
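; fsub by a splat of +2.0 is folded to an fadd of -2.0 once the two v4f32
; ops are concatenated, so the AVX forms load or broadcast a negated
; constant; SSE keeps the original subps against +2.0.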
| define <8 x float> @widen_fsub_v4f32_v8f32_const(<4 x float> %x, <4 x float> %y) { |
| ; SSE-LABEL: widen_fsub_v4f32_v8f32_const: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] |
| ; SSE-NEXT: subps %xmm2, %xmm0 |
| ; SSE-NEXT: subps %xmm2, %xmm1 |
| ; SSE-NEXT: retq |
| ; |
| ; AVX1-LABEL: widen_fsub_v4f32_v8f32_const: |
| ; AVX1: # %bb.0: |
| ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX2-LABEL: widen_fsub_v4f32_v8f32_const: |
| ; AVX2: # %bb.0: |
| ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0] |
| ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 |
| ; AVX2-NEXT: retq |
| ; |
| ; AVX512F-LABEL: widen_fsub_v4f32_v8f32_const: |
| ; AVX512F: # %bb.0: |
| ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0] |
| ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: widen_fsub_v4f32_v8f32_const: |
| ; AVX512VL: # %bb.0: |
| ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX512VL-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 |
| ; AVX512VL-NEXT: retq |
| %x2 = fsub <4 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> |
| %y2 = fsub <4 x float> %y, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> |
| %r = shufflevector <4 x float> %x2, <4 x float> %y2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| ret <8 x float> %r |
| } |
| |
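; The same fsub -> fadd(-2.0) fold across four v4f32 sources: AVX1/AVX2 use
; two ymm adds of a broadcast constant, and AVX512 widens to a single zmm
; add with an embedded {1to16} broadcast.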
| define <16 x float> @widen_fsub_v4f32_v16f32_const(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { |
| ; SSE-LABEL: widen_fsub_v4f32_v16f32_const: |
| ; SSE: # %bb.0: |
| ; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] |
| ; SSE-NEXT: subps %xmm4, %xmm0 |
| ; SSE-NEXT: subps %xmm4, %xmm1 |
| ; SSE-NEXT: subps %xmm4, %xmm2 |
| ; SSE-NEXT: subps %xmm4, %xmm3 |
| ; SSE-NEXT: retq |
| ; |
| ; AVX1OR2-LABEL: widen_fsub_v4f32_v16f32_const: |
| ; AVX1OR2: # %bb.0: |
| ; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 |
| ; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0] |
| ; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 |
| ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 |
| ; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1 |
| ; AVX1OR2-NEXT: retq |
| ; |
| ; AVX512-LABEL: widen_fsub_v4f32_v16f32_const: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 |
| ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 |
| ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 |
| ; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 |
| ; AVX512-NEXT: retq |
| %x2 = fsub <4 x float> %x, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> |
| %y2 = fsub <4 x float> %y, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> |
| %z2 = fsub <4 x float> %z, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> |
| %w2 = fsub <4 x float> %w, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> |
| %r0 = shufflevector <4 x float> %x2, <4 x float> %y2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %r1 = shufflevector <4 x float> %z2, <4 x float> %w2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %r = shufflevector <8 x float> %r0, <8 x float> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| ret <16 x float> %r |
| } |