mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-15 23:31:37 +00:00
Scalarizer for masked load and store intrinsics.
Masked vector intrinsics are a part of common LLVM IR, but they are only supported natively on AVX2 and AVX-512 targets. This patch adds code that translates the masked intrinsics for all other targets: each masked vector intrinsic is converted to a chain of scalar operations inside conditional basic blocks.

http://reviews.llvm.org/D6436

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224897 91177308-0d34-0410-b5e6-96231b3b80d8
parent 04c853b269
commit 8499a501e4
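
For orientation, the two intrinsics this patch scalarizes look like this in IR; a minimal sketch assembled from the calls in the test file at the end of this diff:

  ; Masked load: lanes whose mask bit is set are read from %addr; the
  ; remaining lanes take their value from the pass-through operand.
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> undef)

  ; Masked store: lanes whose mask bit is set are written to %addr;
  ; memory under the unset lanes is left untouched.
  call void @llvm.masked.store.v16i32(<16 x i32> %val, <16 x i32>* %addr, i32 4, <16 x i1> %mask)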
lib/CodeGen/CodeGenPrepare.cpp:

@@ -164,11 +164,11 @@ class TypePromotionTransaction;
     bool EliminateMostlyEmptyBlocks(Function &F);
     bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
     void EliminateMostlyEmptyBlock(BasicBlock *BB);
-    bool OptimizeBlock(BasicBlock &BB);
-    bool OptimizeInst(Instruction *I);
+    bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT);
+    bool OptimizeInst(Instruction *I, bool& ModifiedDT);
     bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
     bool OptimizeInlineAsmInst(CallInst *CS);
-    bool OptimizeCallInst(CallInst *CI);
+    bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT);
     bool MoveExtToFormExtLoad(Instruction *&I);
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
@@ -245,7 +245,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     MadeChange = false;
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = I++;
-      MadeChange |= OptimizeBlock(*BB);
+      bool ModifiedDTOnIteration = false;
+      MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration);
+
+      // Restart BB iteration if the dominator tree of the Function was changed
+      ModifiedDT |= ModifiedDTOnIteration;
+      if (ModifiedDTOnIteration)
+        break;
     }
     EverMadeChange |= MadeChange;
   }
@@ -857,7 +863,211 @@ protected:
 };
 } // end anonymous namespace
 
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+// ScalarizeMaskedLoad() translates a masked load intrinsic, like
+//   <16 x i32> @llvm.masked.load(<16 x i32>* %addr, i32 align,
+//                                <16 x i1> %mask, <16 x i32> %passthru)
+// into a chain of basic blocks, loading elements one by one if
+// the appropriate mask bit is set.
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.load, label %else
+//
+// cond.load:                                        ; preds = %0
+//  %4 = getelementptr i32* %1, i32 0
+//  %5 = load i32* %4
+//  %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+//  br label %else
+//
+// else:                                             ; preds = %0, %cond.load
+//  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+//  %7 = extractelement <16 x i1> %mask, i32 1
+//  %8 = icmp eq i1 %7, true
+//  br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1:                                       ; preds = %else
+//  %9 = getelementptr i32* %1, i32 1
+//  %10 = load i32* %9
+//  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+//  br label %else2
+//
+// else2:                                            ; preds = %else, %cond.load1
+//  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+//  %12 = extractelement <16 x i1> %mask, i32 2
+//  %13 = icmp eq i1 %12, true
+//  br i1 %13, label %cond.load4, label %else5
+//
+static void ScalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(0);
+  Value *Src0 = CI->getArgOperand(3);
+  Value *Mask = CI->getArgOperand(2);
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_load = icmp eq i1 %mask_1, true
+    //  br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst *Load = Builder.CreateLoad(Gep, false);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// ScalarizeMaskedStore() translates a masked store intrinsic, like
+//   void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+//                           <16 x i1> %mask)
+// into a chain of basic blocks, storing elements one by one if
+// the appropriate mask bit is set.
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.store, label %else
+//
+// cond.store:                                       ; preds = %0
+//  %4 = extractelement <16 x i32> %val, i32 0
+//  %5 = getelementptr i32* %1, i32 0
+//  store i32 %4, i32* %5
+//  br label %else
+//
+// else:                                             ; preds = %0, %cond.store
+//  %6 = extractelement <16 x i1> %mask, i32 1
+//  %7 = icmp eq i1 %6, true
+//  br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1:                                      ; preds = %else
+//  %8 = extractelement <16 x i32> %val, i32 1
+//  %9 = getelementptr i32* %1, i32 1
+//  store i32 %8, i32* %9
+//  br label %else2
+//  . . .
+static void ScalarizeMaskedStore(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(1);
+  Value *Src  = CI->getArgOperand(0);
+  Value *Mask = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_store = icmp eq i1 %mask_1, true
+    //  br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value *Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateStore(OneElt, Gep);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
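
Unlike the load case, there is no phi chain to maintain here: a store produces no value, so each iteration only splits the block and conditionally stores one lane. For a hypothetical <2 x i32> masked store the emitted chain would look roughly like this (pre-3.7 pointer-typed IR, names illustrative):

  %p = bitcast i8* %addr to i32*
  %m0 = extractelement <2 x i1> %mask, i32 0
  %c0 = icmp eq i1 %m0, true
  br i1 %c0, label %cond.store, label %else

cond.store:                                        ; store lane 0 only if selected
  %e0 = extractelement <2 x i32> %val, i32 0
  %a0 = getelementptr i32* %p, i32 0
  store i32 %e0, i32* %a0
  br label %else

else:                                              ; same pattern repeats for lane 1
  %m1 = extractelement <2 x i1> %mask, i32 1
  . . .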
+
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
   // Lower inline assembly if we can.
@@ -877,38 +1087,60 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
     return true;
   }
 
-  // Lower all uses of llvm.objectsize.*
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
-  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
-    Type *ReturnTy = CI->getType();
-    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
-    // Substituting this can cause recursive simplifications, which can
-    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
-    // happens.
-    WeakVH IterHandle(CurInstIterator);
-
-    replaceAndRecursivelySimplify(CI, RetVal,
-                                  TLI ? TLI->getDataLayout() : nullptr,
-                                  TLInfo, ModifiedDT ? nullptr : DT);
-
-    // If the iterator instruction was recursively deleted, start over at the
-    // start of the block.
-    if (IterHandle != CurInstIterator) {
-      CurInstIterator = BB->begin();
-      SunkAddrs.clear();
-    }
-    return true;
-  }
-
-  if (II && TLI) {
-    SmallVector<Value*, 2> PtrOps;
-    Type *AccessTy;
-    if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
-      while (!PtrOps.empty())
-        if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
-          return true;
-  }
+  if (II) {
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::objectsize: {
+      // Lower all uses of llvm.objectsize.*
+      bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+      Type *ReturnTy = CI->getType();
+      Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
+      // Substituting this can cause recursive simplifications, which can
+      // invalidate our iterator.  Use a WeakVH to hold onto it in case this
+      // happens.
+      WeakVH IterHandle(CurInstIterator);
+
+      replaceAndRecursivelySimplify(CI, RetVal,
+                                    TLI ? TLI->getDataLayout() : nullptr,
+                                    TLInfo, ModifiedDT ? nullptr : DT);
+
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      if (IterHandle != CurInstIterator) {
+        CurInstIterator = BB->begin();
+        SunkAddrs.clear();
+      }
+      return true;
+    }
+    case Intrinsic::masked_load: {
+      // Scalarize unsupported vector masked load
+      if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) {
+        ScalarizeMaskedLoad(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
+    case Intrinsic::masked_store: {
+      if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) {
+        ScalarizeMaskedStore(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
+    }
+
+    if (TLI) {
+      SmallVector<Value*, 2> PtrOps;
+      Type *AccessTy;
+      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
+        while (!PtrOps.empty())
+          if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
+            return true;
+    }
+  }
 
   // From here on out we're working with named functions.
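
The isLegalMaskedLoad/isLegalMaskedStore hooks let each target keep the forms it can lower directly; only the rest is expanded. As an illustration (my example, not from this commit): AVX2's vpmaskmovd/vpmaskmovq cover only 32- and 64-bit elements, so a 16-bit-element masked load would presumably still be scalarized on an AVX2 target:

  ; Assumed illegal for AVX2's masked moves (no 16-bit element form),
  ; hence expanded into the conditional-load chain shown earlier.
  %r = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %p, i32 2, <8 x i1> %m, <8 x i16> undef)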

@@ -3801,7 +4033,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
   return false;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
@@ -3880,14 +4112,14 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC);
+      OptimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI);
+    return OptimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
     return OptimizeSelectInst(SI);
@@ -3904,14 +4136,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
+  while (CurInstIterator != BB.end()) {
+    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    if (ModifiedDT)
+      return true;
+  }
   MadeChange |= DupRetToEnableTailCallOpts(&BB);
 
   return MadeChange;
test/CodeGen/X86/masked_memop.ll:

@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
 
 ; AVX512-LABEL: test1
 ; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
@@ -9,6 +10,12 @@
 ; AVX2: vpmaskmovd (%rdi)
 ; AVX2-NOT: blend
+
+; AVX_SCALAR-LABEL: test1
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
 define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
@@ -31,6 +38,14 @@ define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
 ; AVX512-LABEL: test3
 ; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
+
+; AVX_SCALAR-LABEL: test3
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
 define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
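
The captured hunk ends here. For reference, the intrinsics exercised by this test are declared with signatures of the following shape (a sketch; the actual declarations sit in the truncated part of the file):

  declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
  declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)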