llvm-6502/test/CodeGen/X86/vector-shuffle-128-v8.ll

833 lines
36 KiB
LLVM
Raw Normal View History

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSSE3
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_01012323
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,0,1,1]
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_67452301
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_456789AB
; ALL: # BB#0:
; ALL: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_00000000
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_00000000
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_00004444
; ALL: # BB#0:
; ALL-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
; ALL-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_31206745
; ALL: # BB#0:
; ALL-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_44440000
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,0,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_44440000
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_75643120
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_75643120
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_10545410
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_10545410
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_54105410
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_54105410
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_54101054
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_54101054
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_04400440
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,4,4,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_04400440
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_40044004
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,0,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_40044004
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_26405173
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_26405173
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_20645173
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_20645173
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_26401375
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_26401375
; SSSE3: # BB#0:
[x86] Largely complete the use of PSHUFB in the new vector shuffle lowering with a small addition to it and adding PSHUFB combining. There is one obvious place in the new vector shuffle lowering where we should form PSHUFBs directly: when without them we will unpack a vector of i8s across two different registers and do a potentially 4-way blend as i16s only to re-pack them into i8s afterward. This is the crazy expensive fallback path for i8 shuffles and we can just directly use pshufb here as it will always be cheaper (the unpack and pack are two instructions so even a single shuffle between them hits our three instruction limit for forming PSHUFB). However, this doesn't generate very good code in many cases, and it leaves a bunch of common patterns not using PSHUFB. So this patch also adds support for extracting a shuffle mask from PSHUFB in the X86 lowering code, and uses it to handle PSHUFBs in the recursive shuffle combining. This allows us to combine through them, combine multiple ones together, and generally produce sufficiently high quality code. Extracting the PSHUFB mask is annoyingly complex because it could be either pre-legalization or post-legalization. At least this doesn't have to deal with re-materialized constants. =] I've added decode routines to handle the different patterns that show up at this level and we dispatch through them as appropriate. The two primary test cases are updated. For the v16 test case there is still a lot of room for improvement. Since I was going through it systematically I left behind a bunch of FIXME lines that I'm hoping to turn into ALL lines by the end of this. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214628 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-02 10:39:15 +00:00
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_66751643
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,5,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,1,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_66751643
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_60514754
; SSE2: # BB#0:
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_60514754
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4>
ret <8 x i16> %shuffle
}
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_00444444
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_00444444
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_44004444
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_44004444
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_04404444
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_04404444
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_04400000
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_04400000
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_04404567
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; ALL-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_0X444444
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_0X444444
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,{{[0-9]+,[0-9]+}},8,9,8,9,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_44X04444
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_44X04444
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[8,9,8,9,{{[0-9]+,[0-9]+}},0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_X4404444
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_X4404444
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+}},8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_0127XXXX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_0127XXXX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,2,3,4,5,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_XXXX4563
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_XXXX4563
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},8,9,10,11,12,13,6,7]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_4563XXXX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_4563XXXX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[8,9,10,11,12,13,6,7,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_01274563
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_01274563
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_45630127
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,3,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_45630127
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_37102735
; SSE2: # BB#0:
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,5,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,4,5,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_37102735
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5>
ret <8 x i16> %shuffle
}
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_08192a3b
; ALL: # BB#0:
; ALL-NEXT: punpcklwd %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_0c1d2e3f
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
; ALL-NEXT: punpcklwd %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_4c5d6e7f
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
; ALL-NEXT: punpcklwd %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_48596a7b
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
; ALL-NEXT: punpcklwd %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_08196e7f
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,3,2,3]
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
; ALL-NEXT: punpcklwd %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_0c1d6879
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3]
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
; ALL-NEXT: punpcklwd %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_109832ba
; ALL: # BB#0:
; ALL-NEXT: punpcklwd %xmm1, %xmm0
; ALL-NEXT: pshuflw {{.*}} # xmm1 = xmm0[2,0,3,1,4,5,6,7]
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
; ALL-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7]
; ALL-NEXT: punpcklqdq %xmm0, %xmm1
; ALL-NEXT: movdqa %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_8091a2b3
; ALL: # BB#0:
; ALL-NEXT: punpcklwd %xmm0, %xmm1
; ALL-NEXT: movdqa %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_c4d5e6f7
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm2 = xmm0[2,3,2,3]
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
; ALL-NEXT: punpcklwd %xmm2, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_0213cedf
; ALL: # BB#0:
; ALL-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
; ALL-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
; ALL-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,1,3,4,5,6,7]
; ALL-NEXT: punpcklqdq %xmm1, %xmm0
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_443aXXXX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE2-NEXT: punpcklwd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_443aXXXX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; SSSE3-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSSE3-NEXT: punpcklwd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,4,5,12,13,10,11,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_032dXXXX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
; SSE2-NEXT: punpcklwd %xmm1, %xmm0
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_032dXXXX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
; SSSE3-NEXT: punpcklwd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,12,13,8,9,6,7,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
; ALL-LABEL: @shuffle_v8i16_XXXcXXXX
; ALL: # BB#0:
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,1,2,3]
; ALL-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7]
; ALL-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_012dXXXX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
; SSE2-NEXT: punpcklwd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_012dXXXX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
; SSSE3-NEXT: punpcklwd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,4,5,8,9,6,7,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_XXXXcde3
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: punpckhwd %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_XXXXcde3
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; SSSE3-NEXT: punpckhwd %xmm0, %xmm1 # xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: pshufb {{.*}} # xmm1 = xmm1[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},0,1,4,5,8,9,14,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_cde3XXXX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: punpckhwd %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_cde3XXXX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; SSSE3-NEXT: punpckhwd %xmm0, %xmm1 # xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: pshufb {{.*}} # xmm1 = xmm1[0,1,4,5,8,9,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_012dcde3
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
; SSE2-NEXT: pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
; SSE2-NEXT: punpckhwd %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpcklwd %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
; SSE2-NEXT: punpcklqdq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_012dcde3
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
; SSSE3-NEXT: pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
; SSSE3-NEXT: punpckhwd %xmm2, %xmm1 # xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: pshufb {{.*}} # xmm1 = xmm1[0,1,4,5,8,9,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: punpcklwd %xmm3, %xmm0 # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,4,5,8,9,6,7,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
; SSSE3-NEXT: punpcklqdq %xmm1, %xmm0
; SSSE3-NEXT: retq
[x86] Begin a significant overhaul of how vector lowering is done in the x86 backend. This sketches out a new code path for vector lowering, hidden behind an off-by-default flag while it is under development. The fundamental idea behind the new code path is to aggressively break down the problem space in ways that ease selecting the odd set of instructions available on x86, and carefully avoid scalarizing code even when forced to use older ISAs. Notably, this starts off restricting itself to SSE2 and implements the complete vector shuffle and blend space for 128-bit vectors in SSE2 without scalarizing. The plan is to layer on top of this ISA extensions where we can bail out of the complex SSE2 lowering and opt for a cheaper, specialized instruction (or set of instructions). It also needs to be generalized to AVX and AVX512 vector widths. Currently, this does a decent but not perfect job for SSE2. There are some specific shortcomings that I plan to address: - We need a peephole combine to fold together shuffles where possible. There are cases where a previous shuffle could be modified slightly to arrange for elements to be in the correct position and a later shuffle eliminated. Doing this eagerly added quite a bit of complexity, and so my plan is to combine away these redundancies afterward. - There are a lot more clever ways to use unpck and pack that need to be added. This is essential for real world shuffles as it turns out... Once SSE2 is polished a bit I should be able to get interesting numbers on performance improvements on benchmarks conducive to vectorization. All of this will be off by default until it is functionally equivalent of course. Differential Revision: http://reviews.llvm.org/D4225 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211888 91177308-0d34-0410-b5e6-96231b3b80d8
2014-06-27 11:23:44 +00:00
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
[x86] Fix a crash and wrong-code bug in the new vector lowering all found by a single test reduced out of a failure on llvm-stress. The start of the problem (and the crash) came when we tried to use a find of a non-used slot in the move-to half of the move-mask as the target for two bad-half inputs. While if lucky this will be the first of a pair of slots which we can place the bad-half inputs into, it isn't actually guaranteed. This really isn't surprising, not sure what I was thinking. The correct way to find the two unused slots is to look for one of the *used* slots. We know it isn't that pair, and we can use some modular arithmetic to find the other pair by masking off the odd bit and adding 2 modulo 4. With this, we reliably found a viable pair of slots for the bad-half inputs. Sadly, that wasn't enough. We also had a wrong code bug that surfaced when I reduced the test case for this where we would use the same slot twice for the two bad inputs. This is because both of the bad inputs could be in odd slots originally and thus the mod-2 mapping would actually be the same. The whole point of the weird indexing into the pair of empty slots was to try to leverage when the end result needed the two bad-half inputs to be paired in a dword and pre-pair them in the correct orrientation. This is less important with the powerful combining we're now doing, and also easier and more reliable to achieve be noting that we add the bad-half inputs in order. Thus, if they are in a dword pair, the low part of that will be the first input in the sequence. Always putting that in the low element will just do the right thing in addition to computing the correct result. Test case added. =] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214849 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-05 08:19:21 +00:00
define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @shuffle_v8i16_XXX1X579
; SSE2: # BB#0:
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,7]
[x86] Fix a crash and wrong-code bug in the new vector lowering all found by a single test reduced out of a failure on llvm-stress. The start of the problem (and the crash) came when we tried to use a find of a non-used slot in the move-to half of the move-mask as the target for two bad-half inputs. While if lucky this will be the first of a pair of slots which we can place the bad-half inputs into, it isn't actually guaranteed. This really isn't surprising, not sure what I was thinking. The correct way to find the two unused slots is to look for one of the *used* slots. We know it isn't that pair, and we can use some modular arithmetic to find the other pair by masking off the odd bit and adding 2 modulo 4. With this, we reliably found a viable pair of slots for the bad-half inputs. Sadly, that wasn't enough. We also had a wrong code bug that surfaced when I reduced the test case for this where we would use the same slot twice for the two bad inputs. This is because both of the bad inputs could be in odd slots originally and thus the mod-2 mapping would actually be the same. The whole point of the weird indexing into the pair of empty slots was to try to leverage when the end result needed the two bad-half inputs to be paired in a dword and pre-pair them in the correct orrientation. This is less important with the powerful combining we're now doing, and also easier and more reliable to achieve be noting that we add the bad-half inputs in order. Thus, if they are in a dword pair, the low part of that will be the first input in the sequence. Always putting that in the low element will just do the right thing in addition to computing the correct result. Test case added. =] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214849 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-05 08:19:21 +00:00
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,3,2,4,5,6,7]
[x86] Fix a crash and wrong-code bug in the new vector lowering all found by a single test reduced out of a failure on llvm-stress. The start of the problem (and the crash) came when we tried to use a find of a non-used slot in the move-to half of the move-mask as the target for two bad-half inputs. While if lucky this will be the first of a pair of slots which we can place the bad-half inputs into, it isn't actually guaranteed. This really isn't surprising, not sure what I was thinking. The correct way to find the two unused slots is to look for one of the *used* slots. We know it isn't that pair, and we can use some modular arithmetic to find the other pair by masking off the odd bit and adding 2 modulo 4. With this, we reliably found a viable pair of slots for the bad-half inputs. Sadly, that wasn't enough. We also had a wrong code bug that surfaced when I reduced the test case for this where we would use the same slot twice for the two bad inputs. This is because both of the bad inputs could be in odd slots originally and thus the mod-2 mapping would actually be the same. The whole point of the weird indexing into the pair of empty slots was to try to leverage when the end result needed the two bad-half inputs to be paired in a dword and pre-pair them in the correct orrientation. This is less important with the powerful combining we're now doing, and also easier and more reliable to achieve be noting that we add the bad-half inputs in order. Thus, if they are in a dword pair, the low part of that will be the first input in the sequence. Always putting that in the low element will just do the right thing in addition to computing the correct result. Test case added. =] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214849 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-05 08:19:21 +00:00
; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,5,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: @shuffle_v8i16_XXX1X579
; SSSE3: # BB#0:
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+}},2,3,10,11,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
[x86] Fix a crash and wrong-code bug in the new vector lowering all found by a single test reduced out of a failure on llvm-stress. The start of the problem (and the crash) came when we tried to use a find of a non-used slot in the move-to half of the move-mask as the target for two bad-half inputs. While if lucky this will be the first of a pair of slots which we can place the bad-half inputs into, it isn't actually guaranteed. This really isn't surprising, not sure what I was thinking. The correct way to find the two unused slots is to look for one of the *used* slots. We know it isn't that pair, and we can use some modular arithmetic to find the other pair by masking off the odd bit and adding 2 modulo 4. With this, we reliably found a viable pair of slots for the bad-half inputs. Sadly, that wasn't enough. We also had a wrong code bug that surfaced when I reduced the test case for this where we would use the same slot twice for the two bad inputs. This is because both of the bad inputs could be in odd slots originally and thus the mod-2 mapping would actually be the same. The whole point of the weird indexing into the pair of empty slots was to try to leverage when the end result needed the two bad-half inputs to be paired in a dword and pre-pair them in the correct orrientation. This is less important with the powerful combining we're now doing, and also easier and more reliable to achieve be noting that we add the bad-half inputs in order. Thus, if they are in a dword pair, the low part of that will be the first input in the sequence. Always putting that in the low element will just do the right thing in addition to computing the correct result. Test case added. =] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214849 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-05 08:19:21 +00:00
; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
[x86] Fix a crash and wrong-code bug in the new vector lowering all found by a single test reduced out of a failure on llvm-stress. The start of the problem (and the crash) came when we tried to use a find of a non-used slot in the move-to half of the move-mask as the target for two bad-half inputs. While if lucky this will be the first of a pair of slots which we can place the bad-half inputs into, it isn't actually guaranteed. This really isn't surprising, not sure what I was thinking. The correct way to find the two unused slots is to look for one of the *used* slots. We know it isn't that pair, and we can use some modular arithmetic to find the other pair by masking off the odd bit and adding 2 modulo 4. With this, we reliably found a viable pair of slots for the bad-half inputs. Sadly, that wasn't enough. We also had a wrong code bug that surfaced when I reduced the test case for this where we would use the same slot twice for the two bad inputs. This is because both of the bad inputs could be in odd slots originally and thus the mod-2 mapping would actually be the same. The whole point of the weird indexing into the pair of empty slots was to try to leverage when the end result needed the two bad-half inputs to be paired in a dword and pre-pair them in the correct orrientation. This is less important with the powerful combining we're now doing, and also easier and more reliable to achieve be noting that we add the bad-half inputs in order. Thus, if they are in a dword pair, the low part of that will be the first input in the sequence. Always putting that in the low element will just do the right thing in addition to computing the correct result. Test case added. =] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214849 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-05 08:19:21 +00:00
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSE2-LABEL: @shuffle_v8i16_XX4X8acX
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,4,7]
; SSE2-NEXT: retq
;
[x86] Fix two independent miscompiles in the process of getting the same test case to actually generate correct code. The primary miscompile fixed here is that we weren't correctly handling in-place elements in one half of a single-input v8i16 shuffle when moving a dword of elements from that half to the other half. Some times, we would clobber the in-place elements in forming the dword to move across halves. The fix to this involves forcibly marking the in-place inputs even when there is no need to gather them into a dword, and to much more carefully re-arrange the elements when grouping them into a dword to move across halves. With these two changes we would generate correct shuffles for the test case, but found another miscompile. There are also some random perturbations of the generated shuffle pattern in SSE2. It looks like a wash; more instructions in some cases fewer in others. The second miscompile would corrupt the results into nonsense. This is a buggy pattern in one of the added DAG combines. Mapping elements through a PSHUFD when pairing redundant half-shuffles is *much* harder than this code makes it out to be -- it requires reasoning about *all* of where the input is used in the PSHUFD, not just one part of where it is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the code didn't guard against it. I think this was just a bad idea and I've just removed that aspect of the combine. No tests regress as a consequence so seems OK. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214954 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-06 10:16:36 +00:00
; SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*}} # [[X:xmm[0-9]+]] = xmm0[2,1,2,3]
; SSSE3-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},2,3,{{[0-9]+,[0-9]+}},0,1,4,5,8,9,{{[0-9]+,[0-9]+}}]
; SSSE3-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
; ALL-LABEL: @shuffle_v8i16_8zzzzzzz
; ALL: # BB#0:
; ALL-NEXT: movzwl {{.*}}, %[[R:.*]]
; ALL-NEXT: movd %[[R]], %xmm0
; ALL-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
; ALL-LABEL: @shuffle_v8i16_z8zzzzzz
; ALL: # BB#0:
; ALL-NEXT: movzwl {{.*}}, %[[R:.*]]
; ALL-NEXT: movd %[[R]], %xmm0
; ALL-NEXT: pslldq $2, %xmm0
; ALL-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
; ALL-LABEL: @shuffle_v8i16_zzzzz8zz
; ALL: # BB#0:
; ALL-NEXT: movzwl {{.*}}, %[[R:.*]]
; ALL-NEXT: movd %[[R]], %xmm0
; ALL-NEXT: pslldq $10, %xmm0
; ALL-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
; ALL-LABEL: @shuffle_v8i16_zuuzuuz8
; ALL: # BB#0:
; ALL-NEXT: movzwl {{.*}}, %[[R:.*]]
; ALL-NEXT: movd %[[R]], %xmm0
; ALL-NEXT: pslldq $14, %xmm0
; ALL-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
; ALL-LABEL: @shuffle_v8i16_zzBzzzzz
; ALL: # BB#0:
; ALL-NEXT: movzwl {{.*}}, %[[R:.*]]
; ALL-NEXT: movd %[[R]], %xmm0
; ALL-NEXT: pslldq $4, %xmm0
; ALL-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 3
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}