Scalarizer for masked load and store intrinsics.

Masked vector intrinsics are a part of common LLVM IR, but they are really supported on AVX2 and AVX-512 targets. I added a code that translates masked intrinsic for all other targets. The masked vector intrinsic is converted to a chain of scalar operations inside conditional basic blocks.

http://reviews.llvm.org/D6436



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224897 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2014-12-28 08:54:45 +00:00
parent 04c853b269
commit 8499a501e4
2 changed files with 287 additions and 38 deletions

View File

@ -164,11 +164,11 @@ class TypePromotionTransaction;
bool EliminateMostlyEmptyBlocks(Function &F);
bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
void EliminateMostlyEmptyBlock(BasicBlock *BB);
bool OptimizeBlock(BasicBlock &BB);
bool OptimizeInst(Instruction *I);
bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT);
bool OptimizeInst(Instruction *I, bool& ModifiedDT);
bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
bool OptimizeInlineAsmInst(CallInst *CS);
bool OptimizeCallInst(CallInst *CI);
bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT);
bool MoveExtToFormExtLoad(Instruction *&I);
bool OptimizeExtUses(Instruction *I);
bool OptimizeSelectInst(SelectInst *SI);
@ -245,7 +245,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
MadeChange = false;
for (Function::iterator I = F.begin(); I != F.end(); ) {
BasicBlock *BB = I++;
MadeChange |= OptimizeBlock(*BB);
bool ModifiedDTOnIteration = false;
MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration);
// Restart BB iteration if the dominator tree of the Function was changed
ModifiedDT |= ModifiedDTOnIteration;
if (ModifiedDTOnIteration)
break;
}
EverMadeChange |= MadeChange;
}
@ -857,7 +863,211 @@ protected:
};
} // end anonymous namespace
bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
// ScalarizeMaskedLoad() translates masked load intrinsic, like
// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
// <16 x i1> %mask, <16 x i32> %passthru)
// to a chain of basic blocks, whith loading element one-by-one if
// the appropriate mask bit is set
//
// %1 = bitcast i8* %addr to i32*
// %2 = extractelement <16 x i1> %mask, i32 0
// %3 = icmp eq i1 %2, true
// br i1 %3, label %cond.load, label %else
//
//cond.load: ; preds = %0
// %4 = getelementptr i32* %1, i32 0
// %5 = load i32* %4
// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
// br label %else
//
//else: ; preds = %0, %cond.load
// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
// %7 = extractelement <16 x i1> %mask, i32 1
// %8 = icmp eq i1 %7, true
// br i1 %8, label %cond.load1, label %else2
//
//cond.load1: ; preds = %else
// %9 = getelementptr i32* %1, i32 1
// %10 = load i32* %9
// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
// br label %else2
//
//else2: ; preds = %else, %cond.load1
// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
// %12 = extractelement <16 x i1> %mask, i32 2
// %13 = icmp eq i1 %12, true
// br i1 %13, label %cond.load4, label %else5
//
static void ScalarizeMaskedLoad(CallInst *CI) {
Value *Ptr = CI->getArgOperand(0);
Value *Src0 = CI->getArgOperand(3);
Value *Mask = CI->getArgOperand(2);
VectorType *VecType = dyn_cast<VectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
assert(VecType && "Unexpected return type of masked load intrinsic");
IRBuilder<> Builder(CI->getContext());
Instruction *InsertPt = CI;
BasicBlock *IfBlock = CI->getParent();
BasicBlock *CondBlock = nullptr;
BasicBlock *PrevIfBlock = CI->getParent();
Builder.SetInsertPoint(InsertPt);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
// Bitcast %addr fron i8* to EltTy*
Type *NewPtrType =
EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
Value *UndefVal = UndefValue::get(VecType);
// The result vector
Value *VResult = UndefVal;
PHINode *Phi = nullptr;
Value *PrevPhi = UndefVal;
unsigned VectorWidth = VecType->getNumElements();
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
// %mask_1 = extractelement <16 x i1> %mask, i32 Idx
// %to_load = icmp eq i1 %mask_1, true
// br i1 %to_load, label %cond.load, label %else
//
if (Idx > 0) {
Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
Phi->addIncoming(VResult, CondBlock);
Phi->addIncoming(PrevPhi, PrevIfBlock);
PrevPhi = Phi;
VResult = Phi;
}
Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
ConstantInt::get(Predicate->getType(), 1));
// Create "cond" block
//
// %EltAddr = getelementptr i32* %1, i32 0
// %Elt = load i32* %EltAddr
// VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
//
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
Builder.SetInsertPoint(InsertPt);
Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
LoadInst* Load = Builder.CreateLoad(Gep, false);
VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
// Create "else" block, fill it in the next iteration
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
Builder.SetInsertPoint(InsertPt);
Instruction *OldBr = IfBlock->getTerminator();
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
OldBr->eraseFromParent();
PrevIfBlock = IfBlock;
IfBlock = NewIfBlock;
}
Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
Phi->addIncoming(VResult, CondBlock);
Phi->addIncoming(PrevPhi, PrevIfBlock);
Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
CI->replaceAllUsesWith(NewI);
CI->eraseFromParent();
}
// ScalarizeMaskedStore() translates masked store intrinsic, like
// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
// <16 x i1> %mask)
// to a chain of basic blocks, that stores element one-by-one if
// the appropriate mask bit is set
//
// %1 = bitcast i8* %addr to i32*
// %2 = extractelement <16 x i1> %mask, i32 0
// %3 = icmp eq i1 %2, true
// br i1 %3, label %cond.store, label %else
//
// cond.store: ; preds = %0
// %4 = extractelement <16 x i32> %val, i32 0
// %5 = getelementptr i32* %1, i32 0
// store i32 %4, i32* %5
// br label %else
//
// else: ; preds = %0, %cond.store
// %6 = extractelement <16 x i1> %mask, i32 1
// %7 = icmp eq i1 %6, true
// br i1 %7, label %cond.store1, label %else2
//
// cond.store1: ; preds = %else
// %8 = extractelement <16 x i32> %val, i32 1
// %9 = getelementptr i32* %1, i32 1
// store i32 %8, i32* %9
// br label %else2
// . . .
static void ScalarizeMaskedStore(CallInst *CI) {
Value *Ptr = CI->getArgOperand(1);
Value *Src = CI->getArgOperand(0);
Value *Mask = CI->getArgOperand(3);
VectorType *VecType = dyn_cast<VectorType>(Src->getType());
Type *EltTy = VecType->getElementType();
assert(VecType && "Unexpected data type in masked store intrinsic");
IRBuilder<> Builder(CI->getContext());
Instruction *InsertPt = CI;
BasicBlock *IfBlock = CI->getParent();
Builder.SetInsertPoint(InsertPt);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
// Bitcast %addr fron i8* to EltTy*
Type *NewPtrType =
EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
unsigned VectorWidth = VecType->getNumElements();
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %mask_1 = extractelement <16 x i1> %mask, i32 Idx
// %to_store = icmp eq i1 %mask_1, true
// br i1 %to_load, label %cond.store, label %else
//
Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
ConstantInt::get(Predicate->getType(), 1));
// Create "cond" block
//
// %OneElt = extractelement <16 x i32> %Src, i32 Idx
// %EltAddr = getelementptr i32* %1, i32 0
// %store i32 %OneElt, i32* %EltAddr
//
BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
Builder.SetInsertPoint(InsertPt);
Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
Builder.CreateStore(OneElt, Gep);
// Create "else" block, fill it in the next iteration
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
Builder.SetInsertPoint(InsertPt);
Instruction *OldBr = IfBlock->getTerminator();
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
OldBr->eraseFromParent();
IfBlock = NewIfBlock;
}
CI->eraseFromParent();
}
bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
BasicBlock *BB = CI->getParent();
// Lower inline assembly if we can.
@ -877,38 +1087,60 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
return true;
}
// Lower all uses of llvm.objectsize.*
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
Type *ReturnTy = CI->getType();
Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
if (II) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::objectsize: {
// Lower all uses of llvm.objectsize.*
bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
Type *ReturnTy = CI->getType();
Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
// Substituting this can cause recursive simplifications, which can
// invalidate our iterator. Use a WeakVH to hold onto it in case this
// happens.
WeakVH IterHandle(CurInstIterator);
// Substituting this can cause recursive simplifications, which can
// invalidate our iterator. Use a WeakVH to hold onto it in case this
// happens.
WeakVH IterHandle(CurInstIterator);
replaceAndRecursivelySimplify(CI, RetVal,
TLI ? TLI->getDataLayout() : nullptr,
TLInfo, ModifiedDT ? nullptr : DT);
replaceAndRecursivelySimplify(CI, RetVal,
TLI ? TLI->getDataLayout() : nullptr,
TLInfo, ModifiedDT ? nullptr : DT);
// If the iterator instruction was recursively deleted, start over at the
// start of the block.
if (IterHandle != CurInstIterator) {
CurInstIterator = BB->begin();
SunkAddrs.clear();
// If the iterator instruction was recursively deleted, start over at the
// start of the block.
if (IterHandle != CurInstIterator) {
CurInstIterator = BB->begin();
SunkAddrs.clear();
}
return true;
}
case Intrinsic::masked_load: {
// Scalarize unsupported vector masked load
if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) {
ScalarizeMaskedLoad(CI);
ModifiedDT = true;
return true;
}
return false;
}
case Intrinsic::masked_store: {
if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) {
ScalarizeMaskedStore(CI);
ModifiedDT = true;
return true;
}
return false;
}
}
return true;
}
if (II && TLI) {
SmallVector<Value*, 2> PtrOps;
Type *AccessTy;
if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
while (!PtrOps.empty())
if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
return true;
if (TLI) {
SmallVector<Value*, 2> PtrOps;
Type *AccessTy;
if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
while (!PtrOps.empty())
if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
return true;
}
}
// From here on out we're working with named functions.
@ -3801,7 +4033,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
return false;
}
bool CodeGenPrepare::OptimizeInst(Instruction *I) {
bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
if (PHINode *P = dyn_cast<PHINode>(I)) {
// It is possible for very late stage optimizations (such as SimplifyCFG)
// to introduce PHI nodes too late to be cleaned up. If we detect such a
@ -3880,14 +4112,14 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
GEPI->replaceAllUsesWith(NC);
GEPI->eraseFromParent();
++NumGEPsElim;
OptimizeInst(NC);
OptimizeInst(NC, ModifiedDT);
return true;
}
return false;
}
if (CallInst *CI = dyn_cast<CallInst>(I))
return OptimizeCallInst(CI);
return OptimizeCallInst(CI, ModifiedDT);
if (SelectInst *SI = dyn_cast<SelectInst>(I))
return OptimizeSelectInst(SI);
@ -3904,14 +4136,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
SunkAddrs.clear();
bool MadeChange = false;
CurInstIterator = BB.begin();
while (CurInstIterator != BB.end())
MadeChange |= OptimizeInst(CurInstIterator++);
while (CurInstIterator != BB.end()) {
MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
if (ModifiedDT)
return true;
}
MadeChange |= DupRetToEnableTailCallOpts(&BB);
return MadeChange;

View File

@ -1,5 +1,6 @@
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
; AVX512-LABEL: test1
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
@ -9,6 +10,12 @@
; AVX2: vpmaskmovd (%rdi)
; AVX2-NOT: blend
; AVX_SCALAR-LABEL: test1
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
%res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
@ -31,6 +38,14 @@ define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test3
; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX_SCALAR-LABEL: test3
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)