diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
similarity index 99%
rename from test/CodeGen/X86/avx1-stack-reload-folding.ll
rename to test/CodeGen/X86/stack-folding-fp-avx1.ll
index 37dde9b7cde..2115d931939 100644
--- a/test/CodeGen/X86/avx1-stack-reload-folding.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1,7 +1,7 @@
-; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+f16c < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-unknown"
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+f16c < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
 
 ; Stack reload folding tests.
 ;
@@ -45,36 +45,36 @@ define double @stack_fold_addsd(double %a0, double %a1) {
   ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fadd double %a0, %a1
-  ret double %2
-}
-
-define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_addsd_int
-  ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_addss(float %a0, float %a1) {
+  ret double %2
+}
+
+define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addsd_int
+  ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_addss(float %a0, float %a1) {
   ;CHECK-LABEL: stack_fold_addss
   ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fadd float %a0, %a1
-  ret float %2
-}
-
-define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_addss_int
-  ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
+  ret float %2
+}
+
+define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addss_int
+  ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_addsubpd
   ;CHECK: vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -369,28 +369,28 @@ define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
   ret i32 %2
-}
-declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtdq2pd
-  ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
-
-define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
-  ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)
-  ret <4 x double> %2
-}
-declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
-
-define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2pd
+  ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
+  ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
+
+define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
   ;CHECK-LABEL: stack_fold_cvtdq2ps
   ;CHECK: vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -437,28 +437,28 @@ define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
   ;CHECK: vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fptrunc <4 x double> %a0 to <4 x float>
-  ret <4 x float> %2
-}
-
-define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtph2ps
-  ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
-
-define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtph2ps_ymm
-  ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
-  ret <8 x float> %2
-}
-declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
-
-define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
+  ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtph2ps
+  ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+
+define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtph2ps_ymm
+  ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
+
+define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
   ;CHECK-LABEL: stack_fold_cvtps2dq
   ;CHECK: vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -473,46 +473,46 @@ define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
   ret <8 x i32> %2
-}
-declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtps2pd
-  ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
-
-define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtps2pd_ymm
-  ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)
-  ret <4 x double> %2
-}
-declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
-
-define <8 x i16> @stack_fold_cvtps2ph(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtps2ph
-  ;CHECK: vcvtps2ph $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
-  %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  ret <8 x i16> %1
-}
-declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
-
-define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtps2ph_ymm
-  ;CHECK: vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
-  %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  ret <8 x i16> %1
-}
-declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
-
-; TODO stack_fold_cvtsd2si
+}
+declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2pd
+  ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2pd_ymm
+  ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
+
+define <8 x i16> @stack_fold_cvtps2ph(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2ph
+  ;CHECK: vcvtps2ph $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
+  %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
+
+define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2ph_ymm
+  ;CHECK: vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
+  %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
+
+; TODO stack_fold_cvtsd2si
 
 define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
   ;CHECK-LABEL: stack_fold_cvtsd2si_int
@@ -643,20 +643,20 @@ define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
   ret i64 %2
-}
-declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
-
-define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_cvttpd2dq
-  ;CHECK: vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
-  ret <4 x i32> %2
-}
-declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
-
-define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttpd2dq
+  ;CHECK: vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm
   ;CHECK: vcvttpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fptosi <4 x double> %a0 to <4 x i32>
@@ -784,64 +784,64 @@ define double @stack_fold_divsd(double %a0, double %a1) {
   ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fdiv double %a0, %a1
-  ret double %2
-}
-
-define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_divsd_int
-  ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_divss(float %a0, float %a1) {
+  ret double %2
+}
+
+define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_divsd_int
+  ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_divss(float %a0, float %a1) {
   ;CHECK-LABEL: stack_fold_divss
   ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fdiv float %a0, %a1
-  ret float %2
-}
-
-define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_divss_int
-  ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_dppd
-  ;CHECK: vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
-
-define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_dpps
-  ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_dpps_ymm
-  ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
-  ret <8 x float> %2
-}
-declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
-define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_extractf128
+  ret float %2
+}
+
+define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_divss_int
+  ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_dppd
+  ;CHECK: vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_dpps
+  ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_dpps_ymm
+  ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_extractf128
   ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
   ;CHECK: vmovaps {{-?[0-9]*}}(%rsp), %xmm0 {{.*#+}} 16-byte Reload
   %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -983,37 +983,37 @@ define double @stack_fold_maxsd(double %a0, double %a1) {
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fcmp ogt double %a0, %a1
   %3 = select i1 %2, double %a0, double %a1
-  ret double %3
-}
-
-define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_maxsd_int
-  ;CHECK: vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_maxss(float %a0, float %a1) {
+  ret double %3
+}
+
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_maxsd_int
+  ;CHECK: vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_maxss(float %a0, float %a1) {
   ;CHECK-LABEL: stack_fold_maxss
   ;CHECK: vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fcmp ogt float %a0, %a1
   %3 = select i1 %2, float %a0, float %a1
-  ret float %3
-}
-
-define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_maxss_int
-  ;CHECK: vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
+  ret float %3
+}
+
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_maxss_int
+  ;CHECK: vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_minpd
   ;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -1055,37 +1055,37 @@ define double @stack_fold_minsd(double %a0, double %a1) {
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fcmp olt double %a0, %a1
   %3 = select i1 %2, double %a0, double %a1
-  ret double %3
-}
-
-define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_minsd_int
-  ;CHECK: vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_minss(float %a0, float %a1) {
+  ret double %3
+}
+
+define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_minsd_int
+  ;CHECK: vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_minss(float %a0, float %a1) {
   ;CHECK-LABEL: stack_fold_minss
   ;CHECK: vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fcmp olt float %a0, %a1
   %3 = select i1 %2, float %a0, float %a1
-  ret float %3
-}
-
-define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_minss_int
-  ;CHECK: vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
-
-; TODO stack_fold_movd (load / store)
+  ret float %3
+}
+
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_minss_int
+  ;CHECK: vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+; TODO stack_fold_movd (load / store)
 ; TODO stack_fold_movq (load / store)
 
 ; TODO stack_fold_movddup
@@ -1157,36 +1157,36 @@ define double @stack_fold_mulsd(double %a0, double %a1) {
   ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fmul double %a0, %a1
-  ret double %2
-}
-
-define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_mulsd_int
-  ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_mulss(float %a0, float %a1) {
+  ret double %2
+}
+
+define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_mulsd_int
+  ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_mulss(float %a0, float %a1) {
   ;CHECK-LABEL: stack_fold_mulss
   ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fmul float %a0, %a1
-  ret float %2
-}
-
-define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_mulss_int
-  ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
+  ret float %2
+}
+
+define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_mulss_int
+  ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_orpd
   ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -1255,30 +1255,30 @@ define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
   ;CHECK: vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  ret <4 x double> %2
-}
-
-define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_permilpdvar
-  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
-
-define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
-  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
-  ret <4 x double> %2
-}
-declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
-
-define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_permilps
-  ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ret <4 x double> %2
+}
+
+define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar
+  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
+
+define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
+  ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
+
+define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps
+  ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %2
@@ -1289,30 +1289,30 @@ define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
   ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-  ret <8 x float> %2
-}
-
-define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_permilpsvar
-  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
-
-define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
-  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
-  ret <8 x float> %2
-}
-declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
-
-; TODO stack_fold_rcpps
-
-define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
+  ret <8 x float> %2
+}
+
+define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar
+  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
+
+define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
+  ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
+
+; TODO stack_fold_rcpps
+
+define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
   ;CHECK-LABEL: stack_fold_rcpps_int
   ;CHECK: vrcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -1330,67 +1330,67 @@ define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
   %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
   ret <8 x float> %2
 }
-declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
-
-; TODO stack_fold_rcpss
-
-define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_rcpss_int
-  ;CHECK: vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_roundpd
-  ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
-
-define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_roundpd_ymm
-  ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
-  ret <4 x double> %2
-}
-declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
-
-define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_roundps
-  ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
-
-define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_roundps_ymm
-  ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
-  ret <8 x float> %2
-}
-declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
-
-; TODO stack_fold_roundsd
-
-; TODO stack_fold_roundsd_int
-declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
-
-; TODO stack_fold_roundss
-
-; TODO stack_fold_roundss_int
-declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
-
-; TODO stack_fold_rsqrtps
-
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+; TODO stack_fold_rcpss
+
+define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rcpss_int
+  ;CHECK: vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_roundpd
+  ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_roundpd_ymm
+  ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_roundps
+  ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_roundps_ymm
+  ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+; TODO stack_fold_roundsd
+
+; TODO stack_fold_roundsd_int
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+; TODO stack_fold_roundss
+
+; TODO stack_fold_roundss_int
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+; TODO stack_fold_rsqrtps
+
 define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
   ;CHECK-LABEL: stack_fold_rsqrtps_int
   ;CHECK: vrsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -1398,32 +1398,32 @@ define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
   %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
   ret <4 x float> %2
 }
-declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
-
-; TODO stack_fold_rsqrtps_ymm
-
-define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int
-  ;CHECK: vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
-  ret <8 x float> %2
-}
-declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
-
-; TODO stack_fold_rsqrtss
-
-define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_rsqrtss_int
-  ;CHECK: vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_shufpd
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+; TODO stack_fold_rsqrtps_ymm
+
+define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int
+  ;CHECK: vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+; TODO stack_fold_rsqrtss
+
+define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rsqrtss_int
+  ;CHECK: vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_shufpd
   ;CHECK: vshufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
@@ -1563,72 +1563,72 @@ define double @stack_fold_subsd(double %a0, double %a1) {
   ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fsub double %a0, %a1
-  ret double %2
-}
-
-define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_subsd_int
-  ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_subss(float %a0, float %a1) {
+  ret double %2
+}
+
+define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_subsd_int
+  ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_subss(float %a0, float %a1) {
   ;CHECK-LABEL: stack_fold_subss
   ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = fsub float %a0, %a1
-  ret float %2
-}
-
-define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_subss_int
-  ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_testpd
-  ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
-  ret i32 %2
-}
-declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
-
-define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_testpd_ymm
-  ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
-  ret i32 %2
-}
-declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
-
-define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_testps
-  ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
-  ret i32 %2
-}
-declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
-
-define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_testps_ymm
-  ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
-  ret i32 %2
-}
-declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
-
-define i32 @stack_fold_ucomisd(double %a0, double %a1) {
+  ret float %2
+}
+
+define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_subss_int
+  ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_testpd
+  ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_testpd_ymm
+  ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_testps
+  ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_testps_ymm
+  ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @stack_fold_ucomisd(double %a0, double %a1) {
   ;CHECK-LABEL: stack_fold_ucomisd
   ;CHECK: vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
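
For reference, the idiom that every function in this renamed test repeats, distilled into a standalone sketch. This is illustrative only and not part of the diff; it is essentially the @stack_fold_addsd test from above with a made-up function name and added comments. My reading (not stated in the diff itself) is that the inline asm "nop" clobbers xmm2-xmm15 (xmm1-xmm15 in the unary-op tests) while also demanding an xmm register for its "=x" result, so with %a0 and %a1 live in xmm0/xmm1 there is no register left and one argument must be spilled across the asm; -disable-peephole then keeps later peephole load-folding out of the picture, so the CHECK line exercises the spiller's use of the target's memory-folding tables.

; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s

define double @stack_fold_example(double %a0, double %a1) {
  ; The folded reload appears as a stack memory operand directly on vaddsd:
  ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  ; Clobbering xmm2-xmm15 while %a0/%a1 occupy xmm0/xmm1 leaves no register
  ; for the asm's "=x" result, so one argument is spilled and then reloaded.
  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

The RUN line matches the one in the diff except that +f16c is dropped, since the sketch does not exercise the vcvtph2ps/vcvtps2ph tests that need it.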