diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 155bb0a7a58..0b3428ed7a6 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1185,6 +1185,14 @@ public:
     return true;
   }
 
+  /// Return true if it's significantly cheaper to shift a vector by a uniform
+  /// scalar than by an amount which will vary across each lane. On x86, for
+  /// example, there is a "psllw" instruction for the former case, but no simple
+  /// instruction for a general "a << b" operation on vectors.
+  virtual bool isVectorShiftByScalarCheap(Type *Ty) const {
+    return false;
+  }
+
   /// Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b1d734e932b..f038580c5f9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14172,6 +14172,24 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
   return true;
 }
 
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+  unsigned Bits = Ty->getScalarSizeInBits();
+
+  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+  // particularly cheaper than those without.
+  if (Bits == 8)
+    return false;
+
+  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
+  // variable shifts just as cheap as scalar ones.
+  if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+    return false;
+
+  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+  // fully general vector.
+  return true;
+}
+
 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
     return false;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 6ea060ba3bc..ce9594ae3ed 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -679,6 +679,9 @@ namespace llvm {
     /// the immediate into a register.
     virtual bool isLegalAddImmediate(int64_t Imm) const;
 
+
+    virtual bool isVectorShiftByScalarCheap(Type *Ty) const;
+
     /// isTruncateFree - Return true if it's free to truncate a value of
     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
     /// register EAX to i16 by referencing its sub-register AX.
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 0fde256943d..3c9ecce8e3e 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -132,6 +132,7 @@ typedef DenseMap<Instruction *, Type *> InstrToOrigTy;
     bool MoveExtToFormExtLoad(Instruction *I);
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
+    bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI);
     bool DupRetToEnableTailCallOpts(BasicBlock *BB);
     bool PlaceDbgValues(Function &F);
   };
@@ -2719,6 +2720,74 @@ bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
   return true;
 }
 
+
+bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
+  SmallVector<int, 16> Mask(SVI->getShuffleMask());
+  int SplatElem = -1;
+  for (unsigned i = 0; i < Mask.size(); ++i) {
+    if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
+      return false;
+    SplatElem = Mask[i];
+  }
+
+  return true;
+}
+
+/// Some targets have expensive vector shifts if the lanes aren't all the same
+/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
+/// it's often worth sinking a shufflevector splat down to its use so that
+/// codegen can spot all lanes are identical.
+bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+  BasicBlock *DefBB = SVI->getParent();
+
+  // Only do this xform if variable vector shifts are particularly expensive.
+  if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
+    return false;
+
+  // We only expect better codegen by sinking a shuffle if we can recognise a
+  // constant splat.
+  if (!isBroadcastShuffle(SVI))
+    return false;
+
+  // InsertedShuffles - Only insert a shuffle in each block once.
+  DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = SVI->use_begin(), E = SVI->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+
+    // For now only apply this when the splat is used by a shift instruction.
+    if (!User->isShift()) continue;
+
+    // Everything checks out, sink the shuffle if the user's block doesn't
+    // already have a copy.
+    Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
+
+    if (!InsertedShuffle) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+      InsertedShuffle = new ShuffleVectorInst(SVI->getOperand(0),
+                                              SVI->getOperand(1),
+                                              SVI->getOperand(2), "", InsertPt);
+    }
+
+    User->replaceUsesOfWith(SVI, InsertedShuffle);
+    MadeChange = true;
+  }
+
+  // If we removed all uses, nuke the shuffle.
+  if (SVI->use_empty()) {
+    SVI->eraseFromParent();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
 bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -2791,6 +2860,9 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
     return OptimizeSelectInst(SI);
 
+  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
+    return OptimizeShuffleVectorInst(SVI);
+
   return false;
 }
 
diff --git a/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll b/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll
new file mode 100644
index 00000000000..c4ee79cd7fb
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll
@@ -0,0 +1,102 @@
+; RUN: opt -S -codegenprepare -mtriple=x86_64-apple-macosx10.9 -mcpu=core-avx2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX2
+; RUN: opt -S -codegenprepare -mtriple=x86_64-apple-macosx10.9 -mcpu=corei7 %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SSE2
+
+define <16 x i8> @test_8bit(<16 x i8> %lhs, <16 x i8> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_8bit
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+
+; CHECK: if_false:
+; CHECK-NOT: shufflevector
+; CHECK: shl <16 x i8> %lhs, %mask
+  %mask = shufflevector <16 x i8> %tmp, <16 x i8> undef, <16 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <16 x i8> %mask
+
+if_false:
+  %res = shl <16 x i8> %lhs, %mask
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_16bit
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+
+; CHECK: if_false:
+; CHECK: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK: shl <8 x i16> %lhs, [[SPLAT]]
+  %mask = shufflevector <8 x i16> %tmp, <8 x i16> undef, <8 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <8 x i16> %mask
+
+if_false:
+  %res = shl <8 x i16> %lhs, %mask
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_notsplat(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_notsplat
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+
+; CHECK: if_false:
+; CHECK-NOT: shufflevector
+; CHECK: shl <4 x i32> %lhs, %mask
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = shl <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-AVX2-LABEL: @test_32bit
+; CHECK-AVX2: if_false:
+; CHECK-AVX2-NOT: shufflevector
+; CHECK-AVX2: ashr <4 x i32> %lhs, %mask
+
+; CHECK-SSE2-LABEL: @test_32bit
+; CHECK-SSE2: if_false:
+; CHECK-SSE2: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK-SSE2: ashr <4 x i32> %lhs, [[SPLAT]]
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = ashr <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
+; CHECK-AVX2-LABEL: @test_64bit
+; CHECK-AVX2: if_false:
+; CHECK-AVX2-NOT: shufflevector
+; CHECK-AVX2: lshr <2 x i64> %lhs, %mask
+
+; CHECK-SSE2-LABEL: @test_64bit
+; CHECK-SSE2: if_false:
+; CHECK-SSE2: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK-SSE2: lshr <2 x i64> %lhs, [[SPLAT]]
+
+  %mask = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <2 x i64> %mask
+
+if_false:
+  %res = lshr <2 x i64> %lhs, %mask
+  ret <2 x i64> %res
+}
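
For illustration, here is a minimal IR sketch of the effect of the new CodeGenPrepare transformation. It is not part of the patch; @sink_example and the %splat.sunk name are invented for readability. The splat is defined in %entry but its only shift user lives in %shift, so without the sink the per-block instruction selector cannot tell that every lane of the shift amount is identical.

; Input to "opt -codegenprepare" on an x86 target where
; isVectorShiftByScalarCheap returns true for <8 x i16>:
define <8 x i16> @sink_example(<8 x i16> %lhs, i16 %amt, i1 %tst) {
entry:
  %ins = insertelement <8 x i16> undef, i16 %amt, i32 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
  br i1 %tst, label %shift, label %exit

shift:
  %res = shl <8 x i16> %lhs, %splat
  ret <8 x i16> %res

exit:
  ret <8 x i16> %lhs
}

; After the pass, OptimizeShuffleVectorInst re-creates the shuffle at the first
; insertion point of %shift and erases the now-dead original, roughly:
;
; shift:
;   %splat.sunk = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
;   %res = shl <8 x i16> %lhs, %splat.sunk
;   ret <8 x i16> %res
;
; With the splat visible in the same block as the shift, codegen can use a
; vector-by-scalar shift such as psllw instead of expanding a fully general
; <8 x i16> shift.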