mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-02-25 03:30:37 +00:00
[X86, SSE] instcombine common cases of insertps intrinsics into shuffles
This is very similar to D8486 / r232852 (vperm2). If we treat insertps intrinsics as shufflevectors, we can optimize them better. I've left all but the full zero case of the zero mask variants out of this patch. I don't think those can be converted into a single shuffle in all cases, but I'd be happy to be proven wrong as I was for vperm2f128. Either way, we'd need to support whatever sequence we come up with for those cases in the backend before converting them here. Differential Revision: http://reviews.llvm.org/D8833 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@235124 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
3cdbd163d0
commit
81b61c0e50
@ -197,12 +197,51 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Try to replace an insertps intrinsic call that has a constant immediate
/// with simpler IR.
///
/// The immediate permute control byte (third call operand) looks like this:
///    [3:0] - zero mask for each 32-bit lane
///    [5:4] - select one 32-bit destination lane
///    [7:6] - select one 32-bit source lane
///
/// \param II       The insertps intrinsic call to simplify.
/// \param Builder  Builder used to create the replacement shuffle.
/// \returns a zero vector when all four zero mask bits are set, a
/// shufflevector of the two vector operands when no zero mask bit is set, and
/// nullptr (leave the call alone) when the immediate is not a constant or
/// only some of the zero mask bits are set.
static Value *SimplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());

  // Check the lane count before attempting any fold so that every path below,
  // including the all-zero early return, is covered by the check.
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  const uint8_t Imm = static_cast<uint8_t>(CInt->getZExtValue());
  const uint8_t ZMask = Imm & 0xf;
  const uint8_t DestLane = (Imm >> 4) & 0x3;
  const uint8_t SourceLane = (Imm >> 6) & 0x3;

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ConstantAggregateZero::get(VecTy);

  // TODO: Model this case as two shuffles or a 'logical and' plus shuffle?
  if (ZMask)
    return nullptr;

  // If we're not zeroing anything, this is a single shuffle.
  // Replace the selected destination lane with the selected source lane.
  // For all other lanes, pass the first source bits through.
  // Shuffle indices 4..7 address the lanes of the second vector operand.
  int ShuffleMask[4] = {0, 1, 2, 3};
  ShuffleMask[DestLane] = SourceLane + 4;

  return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1),
                                     ShuffleMask);
}
|
||||||
|
|
||||||
/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
|
/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
|
||||||
/// source vectors, unless a zero bit is set. If a zero bit is set,
|
/// source vectors, unless a zero bit is set. If a zero bit is set,
|
||||||
/// then ignore that half of the mask and clear that half of the vector.
|
/// then ignore that half of the mask and clear that half of the vector.
|
||||||
static Value *SimplifyX86vperm2(const IntrinsicInst &II,
|
static Value *SimplifyX86vperm2(const IntrinsicInst &II,
|
||||||
InstCombiner::BuilderTy &Builder) {
|
InstCombiner::BuilderTy &Builder) {
|
||||||
if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
|
if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
|
||||||
VectorType *VecTy = cast<VectorType>(II.getType());
|
VectorType *VecTy = cast<VectorType>(II.getType());
|
||||||
ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
|
ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
|
||||||
|
|
||||||
@ -730,6 +769,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case Intrinsic::x86_sse41_insertps:
|
||||||
|
if (Value *V = SimplifyX86insertps(*II, *Builder))
|
||||||
|
return ReplaceInstUsesWith(*II, V);
|
||||||
|
break;
|
||||||
|
|
||||||
case Intrinsic::x86_sse4a_insertqi: {
|
case Intrinsic::x86_sse4a_insertqi: {
|
||||||
// insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
|
// insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
|
||||||
|
117
test/Transforms/InstCombine/x86-insertps.ll
Normal file
117
test/Transforms/InstCombine/x86-insertps.ll
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
; RUN: opt < %s -instcombine -S | FileCheck %s
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
|
||||||
|
|
||||||
|
; This should never happen, but make sure we don't crash handling a non-constant immediate byte.
|
||||||
|
|
||||||
|
; The control byte is the function argument %c rather than a constant, so the
; call is expected to come out of instcombine unchanged.
define <4 x float> @insertps_non_const_imm(<4 x float> %v1, <4 x float> %v2, i8 %c) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_non_const_imm
; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; If all zero mask bits are set, return a zero regardless of the other control bits.
|
||||||
|
|
||||||
|
; Immediate 15 (0x0f): zero mask 0b1111, destination lane 0, source lane 0.
; All four lanes are zeroed, so the expected result is a constant zero vector.
define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0x0f
; CHECK-NEXT: ret <4 x float> zeroinitializer
}
|
||||||
|
; Immediate 255 (0xff): zero mask 0b1111, destination lane 3, source lane 3.
; The lane selectors are irrelevant once every lane is zeroed, so the expected
; result is still a constant zero vector.
define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0xff
; CHECK-NEXT: ret <4 x float> zeroinitializer
}
|
||||||
|
|
||||||
|
; If some zero mask bits are set, we do not change anything.
|
||||||
|
|
||||||
|
; Immediate 3 (0x03): zero mask 0b0011 (lanes 0 and 1), destination lane 0,
; source lane 0. A partial zero mask is not folded, so the call must survive.
define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0x03
; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; If no zero mask bits are set, convert to a shuffle.
|
||||||
|
|
||||||
|
; Immediate 0 (0x00): no zero mask, destination lane 0, source lane 0.
; Lane 0 of the result takes lane 0 of %v2 (shuffle index 4).
define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0x00
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; Immediate 16 (0x10): no zero mask, destination lane 1, source lane 0.
; Lane 1 of the result takes lane 0 of %v2 (shuffle index 4).
define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0x10
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; Immediate 32 (0x20): no zero mask, destination lane 2, source lane 0.
; Lane 2 of the result takes lane 0 of %v2 (shuffle index 4).
define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0x20
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; Immediate 48 (0x30): no zero mask, destination lane 3, source lane 0.
; Lane 3 of the result takes lane 0 of %v2 (shuffle index 4).
define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0x30
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; Immediate 192 (0xc0): no zero mask, destination lane 0, source lane 3.
; Lane 0 of the result takes lane 3 of %v2 (shuffle index 7).
define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0xc0
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; Immediate 208 (0xd0): no zero mask, destination lane 1, source lane 3.
; Lane 1 of the result takes lane 3 of %v2 (shuffle index 7).
define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0xd0
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; Immediate 224 (0xe0): no zero mask, destination lane 2, source lane 3.
; Lane 2 of the result takes lane 3 of %v2 (shuffle index 7).
define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0xe0
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
||||||
|
; Immediate 240 (0xf0): no zero mask, destination lane 3, source lane 3.
; Lane 3 of the result takes lane 3 of %v2 (shuffle index 7).
define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) {
  %ins = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240)
  ret <4 x float> %ins

; CHECK-LABEL: @insertps_0xf0
; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT: ret <4 x float>
}
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user