Two fixes to the vpermilvar optimization.

The instcombine logic to handle vpermilvar's pd and 256 variants was incorrect.
The _256 variants have indexes into the individual 128-bit lanes, and in all
cases the unused bits of each index also have to be masked out.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207577 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Rafael Espindola
2014-04-29 20:41:54 +00:00
parent 6149bc1e10
commit 984f2fc09e
2 changed files with 28 additions and 5 deletions

View File

@@ -725,9 +725,32 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Convert vpermil* to shufflevector if the mask is constant.
Value *V = II->getArgOperand(1);
if (auto C = dyn_cast<ConstantDataVector>(V)) {
unsigned Size = C->getNumElements();
assert(Size == 8 || Size == 4 || Size == 2);
uint32_t Indexes[8];
// The intrinsics only read one or two bits, clear the rest.
for (unsigned I = 0; I < Size; ++I) {
uint32_t Index = C->getElementAsInteger(I) & 0x3;
if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd ||
II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256)
Index >>= 1;
Indexes[I] = Index;
}
// The _256 variants are a bit trickier since the mask bits always index
// into the corresponding 128 half. In order to convert to a generic
// shuffle, we have to make that explicit.
if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) {
for (unsigned I = Size / 2; I < Size; ++I)
Indexes[I] += Size / 2;
}
auto NewC =
ConstantDataVector::get(C->getContext(), makeArrayRef(Indexes, Size));
auto V1 = II->getArgOperand(0);
auto V2 = UndefValue::get(V1->getType());
auto Shuffle = Builder->CreateShuffleVector(V1, V2, C);
auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
return ReplaceInstUsesWith(CI, Shuffle);
}
break;