[x86] Make vector legalization of extloads work more like the "normal"
vector operation legalization with support for custom target lowering
and fallback to expand when it fails, and use this to implement sext and
anyext load lowering for x86 in a more principled way.
Previously, the x86 backend relied on a target DAG combine to "combine
away" sextload and extload nodes prior to legalization, or would expand
them during legalization with terrible code. This is particularly
problematic because the DAG combine relies on running over non-canonical
DAG nodes at just the right time to match several common and important
patterns. It used a combine rather than lowering because we didn't have
good lowering support, and to expose some of the tricks being employed
there to other combine phases.
With this change it becomes a proper lowering operation, the backend
marks that it can lower these nodes, and I've added support for handling
the canonical forms that don't have direct legal representations such as
sextload of a v4i8 -> v4i64 on AVX1. With this change, our test cases
for this behavior continue to pass even after the DAG combiner begins
running more systematically over every node.
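As a hedged illustration (not part of the original commit message), the
canonical form in question looks like the following IR, where a
sign-extending load of <4 x i8> into <4 x i64> has no single legal AVX1
instruction and must be lowered in stages:

; Hypothetical example: a sextload with no direct legal AVX1 form, now
; handled by lowering instead of an early DAG combine.
define <4 x i64> @sextload_v4i8_to_v4i64(<4 x i8>* %p) {
  %v = load <4 x i8>* %p
  %s = sext <4 x i8> %v to <4 x i64>
  ret <4 x i64> %s
}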
There is some noise caused by this in the test suite, where we now use
vector extends instead of subregister extraction. This doesn't really
seem like the right thing to do, but it is unlikely to be a critical
regression. We do regress in one case where, by lowering to the
target-specific patterns early, we were able to combine away extraneous
legal math nodes. However, this regression is completely addressed by
switching to a widening-based legalization, which is what I'm working
toward anyway, so I've just switched the test to that mode.
Differential Revision: http://reviews.llvm.org/D4654
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213897 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-24 22:09:56 +00:00
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
This commit contains a few changes that had to go in together.
1. Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
(and also scalar_to_vector).
2. Xor/and/or are indifferent to the swizzle operation (a shuffle of one
source): simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A, B))
(see the sketch after the assembly below).
3. Optimize swizzles of shuffles: shuff(shuff(x, y), undef) -> shuff(x, y).
4. Fix an X86ISelLowering optimization which was very bitcast-sensitive.
Code which was previously compiled to this:
movd (%rsi), %xmm0
movdqa .LCPI0_0(%rip), %xmm2
pshufb %xmm2, %xmm0
movd (%rdi), %xmm1
pshufb %xmm2, %xmm1
pxor %xmm0, %xmm1
pshufb .LCPI0_1(%rip), %xmm1
movd %xmm1, (%rdi)
ret
Now compiles to this:
movl (%rsi), %eax
xorl %eax, (%rdi)
ret
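As a hedged sketch of case 2 (not from the original commit), the fold
applies when both single-source shuffles use the same mask:

; Hypothetical example for case 2: xor(shuff(A), shuff(B)) with a shared
; mask becomes shuff(xor(A, B)), leaving a single shuffle.
define <4 x i32> @swizzle_xor(<4 x i32> %A, <4 x i32> %B) {
  %SA = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %SB = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %R = xor <4 x i32> %SA, %SB
  ret <4 x i32> %R
}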
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@153848 91177308-0d34-0410-b5e6-96231b3b80d8
2012-04-01 19:31:22 +00:00
; Check that we perform a scalar XOR on i32.
; CHECK: pull_bitcast
; CHECK: xorl
; CHECK: ret
define void @pull_bitcast (<4 x i8>* %pA, <4 x i8>* %pB) {
  %A = load <4 x i8>* %pA
  %B = load <4 x i8>* %pB
  %C = xor <4 x i8> %A, %B
  store <4 x i8> %C, <4 x i8>* %pA
  ret void
}
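; Check that we do not fold the swizzles when the inner shuffle has
; multiple uses; all three shuffles must remain.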
; CHECK: multi_use_swizzle
; CHECK: mov
; CHECK-NEXT: shuf
; CHECK-NEXT: shuf
; CHECK-NEXT: shuf
; CHECK-NEXT: xor
; CHECK-NEXT: ret
define <4 x i32> @multi_use_swizzle (<4 x i32>* %pA, <4 x i32>* %pB) {
  %A = load <4 x i32>* %pA
  %B = load <4 x i32>* %pB
  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 6>
  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 2>
  %S2 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 2>
  %R = xor <4 x i32> %S1, %S2
  ret <4 x i32> %R
}
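; Check that the scalar XOR is still formed when the loaded value has an
; additional use.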
; CHECK: pull_bitcast2
; CHECK: xorl
; CHECK: ret
define <4 x i8> @pull_bitcast2 (<4 x i8>* %pA, <4 x i8>* %pB, <4 x i8>* %pC) {
  %A = load <4 x i8>* %pA
  store <4 x i8> %A, <4 x i8>* %pC
  %B = load <4 x i8>* %pB
  %C = xor <4 x i8> %A, %B
  store <4 x i8> %C, <4 x i8>* %pA
  ret <4 x i8> %C
}
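; Check that applying the same self-inverse swizzle twice folds both
; shuffles away.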
; CHECK: reverse_1
; CHECK-NOT: shuf
; CHECK: ret
define <4 x i32> @reverse_1 (<4 x i32>* %pA, <4 x i32>* %pB) {
  %A = load <4 x i32>* %pA
  %B = load <4 x i32>* %pB
  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %S1
}
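; Check that we do not fold the shuffles when the second mask does not
; undo the first.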
; CHECK: no_reverse_shuff
; CHECK: shuf
; CHECK: ret
define <4 x i32> @no_reverse_shuff (<4 x i32>* %pA, <4 x i32>* %pB) {
  %A = load <4 x i32>* %pA
  %B = load <4 x i32>* %pB
  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x i32> %S1
}