[x86] Make vector legalization of extloads work more like the "normal"
vector operation legalization with support for custom target lowering
and fallback to expand when it fails, and use this to implement sext and
anyext load lowering for x86 in a more principled way.
Previously, the x86 backend relied on a target DAG combine to "combine
away" sextload and extload nodes prior to legalization, or would expand
them during legalization with terrible code. This is particularly
problematic because the DAG combine relies on running over non-canonical
DAG nodes at just the right time to match several common and important
patterns. It used a combine rather than lowering because we didn't have
good lowering support, and to expose some of the tricks being employed
there to other combine phases.
With this change it becomes a proper lowering operation, the backend
marks that it can lower these nodes, and I've added support for handling
the canonical forms that don't have direct legal representations such as
sextload of a v4i8 -> v4i64 on AVX1. With this change, our test cases
for this behavior continue to pass even after the DAG combiner begins
running more systematically over every node.
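As a hedged illustration (not part of the original commit message), the
canonical form in question looks like the following IR, where a
sign-extending load of <4 x i8> into <4 x i64> has no single legal AVX1
instruction and must be lowered in stages:

; Hypothetical example: a sextload with no direct legal AVX1 form, now
; handled by lowering instead of an early DAG combine.
define <4 x i64> @sextload_v4i8_to_v4i64(<4 x i8>* %p) {
  %v = load <4 x i8>* %p
  %s = sext <4 x i8> %v to <4 x i64>
  ret <4 x i64> %s
}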
There is some noise caused by this in the test suite, where we now use
vector extends instead of subregister extraction. This doesn't really
seem like the right thing to do, but it is unlikely to be a critical
regression. We do regress in one case where, by lowering to the
target-specific patterns early, we were able to combine away extraneous
legal math nodes. However, this regression is completely addressed by
switching to a widening-based legalization, which is what I'm working
toward anyway, so I've just switched the test to that mode.
Differential Revision: http://reviews.llvm.org/D4654
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213897 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-24 22:09:56 +00:00
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
This commit contains a few changes that had to go in together.
1. Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
(and also scalar_to_vector).
2. Xor/and/or are indifferent to the swizzle operation (a shuffle of one
source): simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A, B))
(see the sketch after the assembly below).
3. Optimize swizzles of shuffles: shuff(shuff(x, y), undef) -> shuff(x, y).
4. Fix an X86ISelLowering optimization which was very bitcast-sensitive.
Code which was previously compiled to this:
movd (%rsi), %xmm0
movdqa .LCPI0_0(%rip), %xmm2
pshufb %xmm2, %xmm0
movd (%rdi), %xmm1
pshufb %xmm2, %xmm1
pxor %xmm0, %xmm1
pshufb .LCPI0_1(%rip), %xmm1
movd %xmm1, (%rdi)
ret
Now compiles to this:
movl (%rsi), %eax
xorl %eax, (%rdi)
ret
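As a hedged sketch of case 2 (not from the original commit), the fold
applies when both single-source shuffles use the same mask:

; Hypothetical example for case 2: xor(shuff(A), shuff(B)) with a shared
; mask becomes shuff(xor(A, B)), leaving a single shuffle.
define <4 x i32> @swizzle_xor(<4 x i32> %A, <4 x i32> %B) {
  %SA = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %SB = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %R = xor <4 x i32> %SA, %SB
  ret <4 x i32> %R
}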
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@153848 91177308-0d34-0410-b5e6-96231b3b80d8
2012-04-01 19:31:22 +00:00
; Check that we perform a scalar XOR on i32.
; CHECK: pull_bitcast
; CHECK: xorl
; CHECK: ret
define void @pull_bitcast (<4 x i8>* %pA, <4 x i8>* %pB) {
  %A = load <4 x i8>* %pA
  %B = load <4 x i8>* %pB
  %C = xor <4 x i8> %A, %B
  store <4 x i8> %C, <4 x i8>* %pA
  ret void
}
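; Check that we do not fold the swizzles when the inner shuffle has
; multiple uses; all three shuffles must remain.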
; CHECK: multi_use_swizzle
; CHECK: mov
; CHECK-NEXT: shuf
; CHECK-NEXT: shuf
; CHECK-NEXT: shuf
; CHECK-NEXT: xor
; CHECK-NEXT: ret
define <4 x i32> @multi_use_swizzle (<4 x i32>* %pA, <4 x i32>* %pB) {
  %A = load <4 x i32>* %pA
  %B = load <4 x i32>* %pB
  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 6>
  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 2>
  %S2 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 2>
  %R = xor <4 x i32> %S1, %S2
  ret <4 x i32> %R
}
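; Check that the scalar XOR is still formed when the loaded value has an
; additional use.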
; CHECK: pull_bitcast2
; CHECK: xorl
; CHECK: ret
define <4 x i8> @pull_bitcast2 (<4 x i8>* %pA, <4 x i8>* %pB, <4 x i8>* %pC) {
  %A = load <4 x i8>* %pA
  store <4 x i8> %A, <4 x i8>* %pC
  %B = load <4 x i8>* %pB
  %C = xor <4 x i8> %A, %B
  store <4 x i8> %C, <4 x i8>* %pA
  ret <4 x i8> %C
}
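; Check that applying the same self-inverse swizzle twice folds both
; shuffles away.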
; CHECK: reverse_1
; CHECK-NOT: shuf
; CHECK: ret
define <4 x i32> @reverse_1 (<4 x i32>* %pA, <4 x i32>* %pB) {
  %A = load <4 x i32>* %pA
  %B = load <4 x i32>* %pB
  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %S1
}
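; Check that we do not fold the shuffles when the second mask does not
; undo the first.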
; CHECK: no_reverse_shuff
; CHECK: shuf
; CHECK: ret
define <4 x i32> @no_reverse_shuff (<4 x i32>* %pA, <4 x i32>* %pB) {
  %A = load <4 x i32>* %pA
  %B = load <4 x i32>* %pB
  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x i32> %S1
}