[X86][SSE] Simplified PSUBUS tests

Removed loops from PSUBUS tests - ensures folding is tested. Also renamed SSE2 tests SSSE3 to match cpu.

This is a follow up commit agreed in http://reviews.llvm.org/D7094



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226871 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Simon Pilgrim
2015-01-22 22:19:58 +00:00
parent bdf296fb06
commit c7d6e9b0f9

View File

@@ -1,4 +1,4 @@
; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2 ; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSSE3
; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1 ; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2 ; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
@@ -7,334 +7,344 @@ target triple = "x86_64-apple-macosx10.8.0"
define void @test1(i16* nocapture %head) nounwind { define void @test1(i16* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i16* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i16* %head, i64 %index
%1 = bitcast i16* %0 to <8 x i16>* %1 = bitcast i16* %0 to <8 x i16>*
%2 = load <8 x i16>* %1, align 2 %2 = load <8 x i16>* %1, align 2
%3 = icmp slt <8 x i16> %2, zeroinitializer %3 = icmp slt <8 x i16> %2, zeroinitializer
%4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
%5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
store <8 x i16> %5, <8 x i16>* %1, align 2 store <8 x i16> %5, <8 x i16>* %1, align 2
%index.next = add i64 %index, 8
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; SSE2: @test1 ; SSSE3: @test1
; SSE2: psubusw %xmm0, %xmm1 ; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusw LCPI0_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq
; AVX1: @test1 ; AVX1: @test1
; AVX1: vpsubusw %xmm0, %xmm1, %xmm1 ; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2: @test1 ; AVX2: @test1
; AVX2: vpsubusw %xmm0, %xmm1, %xmm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
} }
define void @test2(i16* nocapture %head) nounwind { define void @test2(i16* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i16* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i16* %head, i64 %index
%1 = bitcast i16* %0 to <8 x i16>* %1 = bitcast i16* %0 to <8 x i16>*
%2 = load <8 x i16>* %1, align 2 %2 = load <8 x i16>* %1, align 2
%3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
%4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767> %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
%5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
store <8 x i16> %5, <8 x i16>* %1, align 2 store <8 x i16> %5, <8 x i16>* %1, align 2
%index.next = add i64 %index, 8
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; SSE2: @test2 ; SSSE3: @test2
; SSE2: psubusw %xmm0, %xmm1 ; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusw LCPI1_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq
; AVX1: @test2 ; AVX1: @test2
; AVX1: vpsubusw %xmm0, %xmm1, %xmm1 ; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2: @test2 ; AVX2: @test2
; AVX2: vpsubusw %xmm0, %xmm1, %xmm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
} }
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind { define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph: vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0 %0 = insertelement <8 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
br label %vector.body %1 = getelementptr inbounds i16* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds i16* %head, i64 %index
%2 = bitcast i16* %1 to <8 x i16>* %2 = bitcast i16* %1 to <8 x i16>*
%3 = load <8 x i16>* %2, align 2 %3 = load <8 x i16>* %2, align 2
%4 = icmp ult <8 x i16> %3, %broadcast15 %4 = icmp ult <8 x i16> %3, %broadcast15
%5 = sub <8 x i16> %3, %broadcast15 %5 = sub <8 x i16> %3, %broadcast15
%6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5 %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
store <8 x i16> %6, <8 x i16>* %2, align 2 store <8 x i16> %6, <8 x i16>* %2, align 2
%index.next = add i64 %index, 8
%7 = icmp eq i64 %index.next, 16384
br i1 %7, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; SSE2: @test3 ; SSSE3: @test3
; SSE2: psubusw %xmm0, %xmm1 ; SSSE3: # BB#0:
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
; AVX1: @test3 ; AVX1: @test3
; AVX1: vpsubusw %xmm0, %xmm1, %xmm1 ; AVX1: # BB#0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2: @test3 ; AVX2: @test3
; AVX2: vpsubusw %xmm0, %xmm1, %xmm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
} }
define void @test4(i8* nocapture %head) nounwind { define void @test4(i8* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i8* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i8* %head, i64 %index
%1 = bitcast i8* %0 to <16 x i8>* %1 = bitcast i8* %0 to <16 x i8>*
%2 = load <16 x i8>* %1, align 1 %2 = load <16 x i8>* %1, align 1
%3 = icmp slt <16 x i8> %2, zeroinitializer %3 = icmp slt <16 x i8> %2, zeroinitializer
%4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128> %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
%5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
store <16 x i8> %5, <16 x i8>* %1, align 1 store <16 x i8> %5, <16 x i8>* %1, align 1
%index.next = add i64 %index, 16
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; SSE2: @test4 ; SSSE3: @test4
; SSE2: psubusb %xmm0, %xmm1 ; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusb LCPI3_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq
; AVX1: @test4 ; AVX1: @test4
; AVX1: vpsubusb %xmm0, %xmm1, %xmm1 ; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2: @test4 ; AVX2: @test4
; AVX2: vpsubusb %xmm0, %xmm1, %xmm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
} }
define void @test5(i8* nocapture %head) nounwind { define void @test5(i8* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i8* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i8* %head, i64 %index
%1 = bitcast i8* %0 to <16 x i8>* %1 = bitcast i8* %0 to <16 x i8>*
%2 = load <16 x i8>* %1, align 1 %2 = load <16 x i8>* %1, align 1
%3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126> %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
%4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127> %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
%5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
store <16 x i8> %5, <16 x i8>* %1, align 1 store <16 x i8> %5, <16 x i8>* %1, align 1
%index.next = add i64 %index, 16
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; SSE2: @test5 ; SSSE3: @test5
; SSE2: psubusb %xmm0, %xmm1 ; SSSE3: # BB#0:
; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: psubusb LCPI4_0(%rip), %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; SSSE3-NEXT: retq
; AVX1: @test5 ; AVX1: @test5
; AVX1: vpsubusb %xmm0, %xmm1, %xmm1 ; AVX1: # BB#0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsubusb LCPI4_0(%rip), %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2: @test5 ; AVX2: @test5
; AVX2: vpsubusb %xmm0, %xmm1, %xmm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsubusb LCPI4_0(%rip), %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
} }
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind { define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph: vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0 %0 = insertelement <16 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
br label %vector.body %1 = getelementptr inbounds i8* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds i8* %head, i64 %index
%2 = bitcast i8* %1 to <16 x i8>* %2 = bitcast i8* %1 to <16 x i8>*
%3 = load <16 x i8>* %2, align 1 %3 = load <16 x i8>* %2, align 1
%4 = icmp ult <16 x i8> %3, %broadcast15 %4 = icmp ult <16 x i8> %3, %broadcast15
%5 = sub <16 x i8> %3, %broadcast15 %5 = sub <16 x i8> %3, %broadcast15
%6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5 %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
store <16 x i8> %6, <16 x i8>* %2, align 1 store <16 x i8> %6, <16 x i8>* %2, align 1
%index.next = add i64 %index, 16
%7 = icmp eq i64 %index.next, 16384
br i1 %7, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; SSE2: @test6 ; SSSE3: @test6
; SSE2: psubusb %xmm0, %xmm1 ; SSSE3: # BB#0:
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusb %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
; AVX1: @test6 ; AVX1: @test6
; AVX1: vpsubusb %xmm0, %xmm1, %xmm1 ; AVX1: # BB#0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2: @test6 ; AVX2: @test6
; AVX2: vpsubusb %xmm0, %xmm1, %xmm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
} }
define void @test7(i16* nocapture %head) nounwind { define void @test7(i16* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i16* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i16* %head, i64 %index
%1 = bitcast i16* %0 to <16 x i16>* %1 = bitcast i16* %0 to <16 x i16>*
%2 = load <16 x i16>* %1, align 2 %2 = load <16 x i16>* %1, align 2
%3 = icmp slt <16 x i16> %2, zeroinitializer %3 = icmp slt <16 x i16> %2, zeroinitializer
%4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
%5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
store <16 x i16> %5, <16 x i16>* %1, align 2 store <16 x i16> %5, <16 x i16>* %1, align 2
%index.next = add i64 %index, 8
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; AVX2: @test7 ; AVX2: @test7
; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
} }
define void @test8(i16* nocapture %head) nounwind { define void @test8(i16* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i16* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i16* %head, i64 %index
%1 = bitcast i16* %0 to <16 x i16>* %1 = bitcast i16* %0 to <16 x i16>*
%2 = load <16 x i16>* %1, align 2 %2 = load <16 x i16>* %1, align 2
%3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
%4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767> %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
%5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
store <16 x i16> %5, <16 x i16>* %1, align 2 store <16 x i16> %5, <16 x i16>* %1, align 2
%index.next = add i64 %index, 8
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; AVX2: @test8 ; AVX2: @test8
; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
} }
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind { define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph: vector.ph:
%0 = insertelement <16 x i16> undef, i16 %w, i32 0 %0 = insertelement <16 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
br label %vector.body %1 = getelementptr inbounds i16* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds i16* %head, i64 %index
%2 = bitcast i16* %1 to <16 x i16>* %2 = bitcast i16* %1 to <16 x i16>*
%3 = load <16 x i16>* %2, align 2 %3 = load <16 x i16>* %2, align 2
%4 = icmp ult <16 x i16> %3, %broadcast15 %4 = icmp ult <16 x i16> %3, %broadcast15
%5 = sub <16 x i16> %3, %broadcast15 %5 = sub <16 x i16> %3, %broadcast15
%6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5 %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
store <16 x i16> %6, <16 x i16>* %2, align 2 store <16 x i16> %6, <16 x i16>* %2, align 2
%index.next = add i64 %index, 8
%7 = icmp eq i64 %index.next, 16384
br i1 %7, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; AVX2: @test9 ; AVX2: @test9
; AVX2: vpsubusw %ymm0, %ymm1, %ymm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
} }
define void @test10(i8* nocapture %head) nounwind { define void @test10(i8* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i8* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i8* %head, i64 %index
%1 = bitcast i8* %0 to <32 x i8>* %1 = bitcast i8* %0 to <32 x i8>*
%2 = load <32 x i8>* %1, align 1 %2 = load <32 x i8>* %1, align 1
%3 = icmp slt <32 x i8> %2, zeroinitializer %3 = icmp slt <32 x i8> %2, zeroinitializer
%4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128> %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
%5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
store <32 x i8> %5, <32 x i8>* %1, align 1 store <32 x i8> %5, <32 x i8>* %1, align 1
%index.next = add i64 %index, 16
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; AVX2: @test10 ; AVX2: @test10
; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
} }
define void @test11(i8* nocapture %head) nounwind { define void @test11(i8* nocapture %head) nounwind {
vector.ph: vector.ph:
br label %vector.body %0 = getelementptr inbounds i8* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i8* %head, i64 %index
%1 = bitcast i8* %0 to <32 x i8>* %1 = bitcast i8* %0 to <32 x i8>*
%2 = load <32 x i8>* %1, align 1 %2 = load <32 x i8>* %1, align 1
%3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126> %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
%4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127> %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
%5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
store <32 x i8> %5, <32 x i8>* %1, align 1 store <32 x i8> %5, <32 x i8>* %1, align 1
%index.next = add i64 %index, 16
%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; AVX2: @test11 ; AVX2: @test11
; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0 ; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
} }
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind { define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph: vector.ph:
%0 = insertelement <32 x i8> undef, i8 %w, i32 0 %0 = insertelement <32 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
br label %vector.body %1 = getelementptr inbounds i8* %head, i64 0
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds i8* %head, i64 %index
%2 = bitcast i8* %1 to <32 x i8>* %2 = bitcast i8* %1 to <32 x i8>*
%3 = load <32 x i8>* %2, align 1 %3 = load <32 x i8>* %2, align 1
%4 = icmp ult <32 x i8> %3, %broadcast15 %4 = icmp ult <32 x i8> %3, %broadcast15
%5 = sub <32 x i8> %3, %broadcast15 %5 = sub <32 x i8> %3, %broadcast15
%6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5 %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
store <32 x i8> %6, <32 x i8>* %2, align 1 store <32 x i8> %6, <32 x i8>* %2, align 1
%index.next = add i64 %index, 16
%7 = icmp eq i64 %index.next, 16384
br i1 %7, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void ret void
; AVX2: @test12 ; AVX2: @test12
; AVX2: vpsubusb %ymm0, %ymm1, %ymm1 ; AVX2: # BB#0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
} }