diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
index e12d57e2fd1..0687592c2a3 100644
--- a/test/CodeGen/X86/stack-folding-int-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -1,7 +1,7 @@
-; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-unknown"
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
 
 ; Stack reload folding tests.
 ;
@@ -59,44 +59,44 @@ define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
   ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
-
-define <4 x i32> @stack_fold_movd_load(i32 %a0) {
-  ;CHECK-LABEL: stack_fold_movd_load
-  ;CHECK: movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
-  ret <4 x i32> %2
-}
-
-define i32 @stack_fold_movd_store(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_movd_store
-  ;CHECK: movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
-  %1 = extractelement <4 x i32> %a0, i32 0
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i32 %1
-}
-
-define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_movq_load
-  ;CHECK: movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
-  ret <2 x i64> %2
-}
-
-define i64 @stack_fold_movq_store(<2 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_movq_store
-  ;CHECK: movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
-  %1 = extractelement <2 x i64> %a0, i32 0
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i64 %1
-}
-
-define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pabsb
-  ;CHECK: pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+}
+declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_movd_load(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_movd_load
+  ;CHECK: movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
+  ret <4 x i32> %2
+}
+
+define i32 @stack_fold_movd_store(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_movd_store
+  ;CHECK: movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  %1 = extractelement <4 x i32> %a0, i32 0
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %1
+}
+
+define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_movq_load
+  ;CHECK: movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %2
+}
+
+define i64 @stack_fold_movq_store(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_movq_store
+  ;CHECK: movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  %1 = extractelement <2 x i64> %a0, i32 0
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %1
+}
+
+define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsb
+  ;CHECK: pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
   ret <16 x i8> %2
@@ -269,37 +269,37 @@ define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
-  ;CHECK-LABEL: stack_fold_pblendvb
-  ;CHECK: pblendvb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
-  ret <16 x i8> %2
-}
-declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
+  ;CHECK-LABEL: stack_fold_pblendvb
+  ;CHECK: pblendvb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
   ;CHECK-LABEL: stack_fold_pblendw
   ;CHECK: pblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)
   ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
-
-define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_pclmulqdq
-  ;CHECK: pclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
-  ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
-
-define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pclmulqdq
+  ;CHECK: pclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_pcmpeqb
   ;CHECK: pcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -332,28 +332,28 @@ define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = icmp eq <8 x i16> %a0, %a1
   %3 = sext <8 x i1> %2 to <8 x i16>
-  ret <8 x i16> %3
-}
-
-define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pcmpestri
-  ;CHECK: pcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
-  %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
-  ret i32 %2
-}
-declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pcmpestrm
-  ;CHECK: pcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
-  %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
-  ret <16 x i8> %2
-}
-declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
+  ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpestri
+  ;CHECK: pcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+  %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpestrm
+  ;CHECK: pcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_pcmpgtb
   ;CHECK: pcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -386,50 +386,50 @@ define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = icmp sgt <8 x i16> %a0, %a1
   %3 = sext <8 x i1> %2 to <8 x i16>
-  ret <8 x i16> %3
-}
-
-define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pcmpistri
-  ;CHECK: pcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
-  ret i32 %2
-}
-declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pcmpistrm
-  ;CHECK: pcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
-  ret <16 x i8> %2
-}
-declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-; TODO stack_fold_pextrb
-
-define i32 @stack_fold_pextrd(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_pextrd
-  ;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
-  ;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
-  %1 = extractelement <4 x i32> %a0, i32 1
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i32 %1
-}
-
-define i64 @stack_fold_pextrq(<2 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_pextrq
-  ;CHECK: pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
-  ;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
-  %1 = extractelement <2 x i64> %a0, i32 1
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i64 %1
-}
-
-; TODO stack_fold_pextrw
-
-define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
+  ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpistri
+  ;CHECK: pcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpistrm
+  ;CHECK: pcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+; TODO stack_fold_pextrb
+
+define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrd
+  ;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+  %1 = extractelement <4 x i32> %a0, i32 1
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %1
+}
+
+define i64 @stack_fold_pextrq(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrq
+  ;CHECK: pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  ;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+  %1 = extractelement <2 x i64> %a0, i32 1
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %1
+}
+
+; TODO stack_fold_pextrw
+
+define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_phaddd
   ;CHECK: phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -453,19 +453,19 @@ define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_phminposuw
-  ;CHECK: phminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
-  ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
-
-define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_phminposuw
+  ;CHECK: phminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_phsubd
   ;CHECK: phsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -489,43 +489,43 @@ define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
-  ;CHECK-LABEL: stack_fold_pinsrb
-  ;CHECK: pinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
-  ret <16 x i8> %2
-}
-
-define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
-  ;CHECK-LABEL: stack_fold_pinsrd
-  ;CHECK: pinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
-  ret <4 x i32> %2
-}
-
-define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
-  ;CHECK-LABEL: stack_fold_pinsrq
-  ;CHECK: pinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
-  ret <2 x i64> %2
-}
-
-define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
-  ;CHECK-LABEL: stack_fold_pinsrw
-  ;CHECK: pinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
-  ret <8 x i16> %2
-}
-
-define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrb
+  ;CHECK: pinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrd
+  ;CHECK: pinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrq
+  ;CHECK: pinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
+  ret <2 x i64> %2
+}
+
+define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrw
+  ;CHECK: pinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw
   ;CHECK: pmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
@@ -647,118 +647,118 @@ define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxbd
-  ;CHECK: pmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
-  ret <4 x i32> %2
-}
-declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxbq
-  ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
-  ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
-
-define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxbw
-  ;CHECK: pmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
-  ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxdq
-  ;CHECK: pmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
-  ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
-
-define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxwd
-  ;CHECK: pmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
-  ret <4 x i32> %2
-}
-declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxwq
-  ;CHECK: pmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
-  ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
-
-define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxbd
-  ;CHECK: pmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
-  ret <4 x i32> %2
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxbq
-  ;CHECK: pmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
-  ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-
-define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxbw
-  ;CHECK: pmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
-  ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxdq
-  ;CHECK: pmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
-  ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
-
-define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxwd
-  ;CHECK: pmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
-  ret <4 x i32> %2
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxwq
-  ;CHECK: pmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
-  ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbd
+  ;CHECK: pmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbq
+  ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbw
+  ;CHECK: pmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxdq
+  ;CHECK: pmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwd
+  ;CHECK: pmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwq
+  ;CHECK: pmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbd
+  ;CHECK: pmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbq
+  ;CHECK: pmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbw
+  ;CHECK: pmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxdq
+  ;CHECK: pmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwd
+  ;CHECK: pmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwq
+  ;CHECK: pmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_pmuldq
   ;CHECK: pmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -983,19 +983,19 @@ define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK: psubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = sub <4 x i32> %a0, %a1
-  ret <4 x i32> %2
-}
-
-define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_psubq
-  ;CHECK: psubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = sub <2 x i64> %a0, %a1
-  ret <2 x i64> %2
-}
-
-define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubsb
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK: psubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <2 x i64> %a0, %a1
+  ret <2 x i64> %2
+}
+
+define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsb
   ;CHECK: psubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
@@ -1009,45 +1009,45 @@ define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusb
-  ;CHECK: psubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
-  ret <16 x i8> %2
-}
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusw
-  ;CHECK: psubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
-  ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK: psubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK: psubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
   ;CHECK-LABEL: stack_fold_psubw
   ;CHECK: psubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = sub <8 x i16> %a0, %a1
-  ret <8 x i16> %2
-}
-
-define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_ptest
-  ;CHECK: ptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
-  ret i32 %2
-}
-declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ret <8 x i16> %2
+}
+
+define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_ptest
+  ;CHECK: ptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_punpckhbw
   ;CHECK: punpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()