[X86] Remove the multiply by 8 that goes into the shift constant for X86ISD::VSHLDQ and X86ISD::VSRLDQ. This simplifies the pattern matching in isel and allows these nodes to become the patterns embedded in the instruction.
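In other words, the immediate on X86ISD::VSHLDQ and X86ISD::VSRLDQ now counts bytes, matching the immediate that PSLLDQ/PSRLDQ actually encode, rather than bits. A minimal sketch of the new convention (an illustrative helper, not code from this patch; it mirrors the getVShift change below and assumes the SelectionDAG context of X86ISelLowering.cpp):

    // Sketch only: create a VSHLDQ node under the new byte-count convention.
    // A caller that still thinks in bits converts once, up front.
    SDValue buildByteShiftLeft(SelectionDAG &DAG, SDLoc dl, SDValue V,
                               unsigned NumBits) {
      assert(NumBits % 8 == 0 && "only byte-sized shifts are representable");
      V = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V);
      // Before this change the constant would have been NumBits; the node
      // now takes NumBits/8, the same byte count the instruction encodes.
      SDValue Amt = DAG.getConstant(NumBits / 8, MVT::i8);
      return DAG.getNode(X86ISD::VSHLDQ, dl, MVT::v2i64, V, Amt);
    }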

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229431 91177308-0d34-0410-b5e6-96231b3b80d8
Craig Topper
2015-02-16 20:52:07 +00:00
parent e124dc723b
commit 4031c08c87
5 changed files with 52 additions and 58 deletions


@@ -538,10 +538,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     if (Shift < 16) {
       SmallVector<Constant*, 32> Idxs;
-      for (unsigned l = 0; l < 32; l += 16)
+      for (unsigned l = 0; l != 32; l += 16)
         for (unsigned i = 0; i != 16; ++i) {
-          unsigned Idx = i + Shift;
-          if (Idx >= 16) Idx += 16; // end of lane, switch operand.
+          unsigned Idx = 32 + i - Shift;
+          if (Idx < 32) Idx -= 16; // end of lane, switch operand.
           Idxs.push_back(Builder.getInt32(Idx + l));
         }
@@ -561,10 +561,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     if (Shift < 16) {
       SmallVector<Constant*, 32> Idxs;
-      for (unsigned l = 0; l < 32; l += 16)
+      for (unsigned l = 0; l != 32; l += 16)
         for (unsigned i = 0; i != 16; ++i) {
-          unsigned Idx = 32 + i - Shift;
-          if (Idx < 32) Idx -= 16; // end of lane, switch operand.
+          unsigned Idx = i + Shift;
+          if (Idx >= 16) Idx += 16; // end of lane, switch operand.
           Idxs.push_back(Builder.getInt32(Idx + l));
         }
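As a sanity check on the two masks above (a standalone illustration, not part of the patch): for the right-shift mask, an index that stays inside the 16-byte lane reads a source byte, and one bumped past 32 reads the shuffle's all-zeros operand, which is what the "switch operand" comments indicate. Printing that mask for a shift of 7 reproduces the vpsrldq CHECK line in the test added further down:

    #include <cstdio>

    int main() {
      const unsigned Shift = 7; // byte shift amount, as in the new test
      for (unsigned l = 0; l != 32; l += 16)    // two 128-bit lanes
        for (unsigned i = 0; i != 16; ++i) {
          unsigned Idx = i + Shift;
          if (Idx >= 16) Idx += 16; // ran off the lane: read zeros instead
          std::printf("%u ", Idx + l);
        }
      std::printf("\n"); // per lane: source bytes 7..15, then zero indices
    }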


@@ -5930,7 +5930,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
-  SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
+  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+  SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
 }
@@ -7761,9 +7762,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
-                                DAG.getConstant(8 * LoByteShift, MVT::i8));
+                                DAG.getConstant(LoByteShift, MVT::i8));
   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
-                                DAG.getConstant(8 * HiByteShift, MVT::i8));
+                                DAG.getConstant(HiByteShift, MVT::i8));
   return DAG.getNode(ISD::BITCAST, DL, VT,
                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
 }
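Both constants above lose the 8* because the rotate itself is byte-granular: the low input is shifted up, the high input is shifted down, and the two disjoint results are OR'd together. A self-contained model of that trick on a single 16-byte lane (the array-based emulation and the function name are illustrative assumptions; LoByteShift and HiByteShift mirror the variables above):

    #include <cassert>
    #include <cstring>

    // Emulates (Lo VSHLDQ LoByteShift) | (Hi VSRLDQ HiByteShift) on one
    // 16-byte lane, with LoByteShift = 16 - Rotation and HiByteShift =
    // Rotation: a byte rotate of the concatenated pair.
    void byteRotateLane(const unsigned char Lo[16], const unsigned char Hi[16],
                        unsigned Rotation, unsigned char Out[16]) {
      assert(Rotation > 0 && Rotation < 16 && "rotation must stay in the lane");
      unsigned char LoSh[16] = {}, HiSh[16] = {};
      std::memcpy(LoSh + (16 - Rotation), Lo, Rotation);  // bytes moved up
      std::memcpy(HiSh, Hi + Rotation, 16 - Rotation);    // bytes moved down
      for (unsigned i = 0; i != 16; ++i)
        Out[i] = LoSh[i] | HiSh[i]; // ranges are disjoint, so OR just merges
    }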
@@ -7907,7 +7908,7 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
     SDValue V = MatchV1 ? V1 : V2;
     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
     V = DAG.getNode(Op, DL, ShiftVT, V,
-                    DAG.getConstant(ByteShift * 8, MVT::i8));
+                    DAG.getConstant(ByteShift, MVT::i8));
     return DAG.getNode(ISD::BITCAST, DL, VT, V);
   };
@@ -8300,7 +8301,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
     V2 = DAG.getNode(
         X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
         DAG.getConstant(
-            V2Index * EltVT.getSizeInBits(),
+            V2Index * EltVT.getSizeInBits()/8,
             DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
   }
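Concretely: inserting into element 2 of a v4i32 now emits a VSHLDQ amount of 2 * 32 / 8 = 8 bytes, where the old code passed 64 bits and relied on the BYTE_imm transform during isel to divide it back down.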


@@ -4174,16 +4174,20 @@ defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
   // 128-bit logical shifts.
   def VPSLLDQri : PDIi8<0x73, MRM7r,
-                        (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2),
+                        (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                         "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                        []>, VEX_4V;
+                        [(set VR128:$dst,
+                          (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
+                        VEX_4V;
   def VPSRLDQri : PDIi8<0x73, MRM3r,
-                        (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2),
+                        (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                         "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                        []>, VEX_4V;
+                        [(set VR128:$dst,
+                          (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
+                        VEX_4V;
   // PSRADQri doesn't exist in SSE[1-3].
 }
 } // Predicates = [HasAVX]
@@ -4219,13 +4223,17 @@ defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
   // 256-bit logical shifts.
   def VPSLLDQYri : PDIi8<0x73, MRM7r,
-                         (outs VR256:$dst), (ins VR256:$src1, i32u8imm:$src2),
+                         (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                          "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         []>, VEX_4V, VEX_L;
+                         [(set VR256:$dst,
+                           (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
+                         VEX_4V, VEX_L;
   def VPSRLDQYri : PDIi8<0x73, MRM3r,
-                         (outs VR256:$dst), (ins VR256:$src1, i32u8imm:$src2),
+                         (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                          "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         []>, VEX_4V, VEX_L;
+                         [(set VR256:$dst,
+                           (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
+                         VEX_4V, VEX_L;
   // PSRADQYri doesn't exist in SSE[1-3].
 }
 } // Predicates = [HasAVX2]
@@ -4261,13 +4269,17 @@ defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
   // 128-bit logical shifts.
   def PSLLDQri : PDIi8<0x73, MRM7r,
-                       (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2),
+                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                        "pslldq\t{$src2, $dst|$dst, $src2}",
-                       [], IIC_SSE_INTSHDQ_P_RI>;
+                       [(set VR128:$dst,
+                         (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
+                       IIC_SSE_INTSHDQ_P_RI>;
   def PSRLDQri : PDIi8<0x73, MRM3r,
-                       (outs VR128:$dst), (ins VR128:$src1, i32u8imm:$src2),
+                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                        "psrldq\t{$src2, $dst|$dst, $src2}",
-                       [], IIC_SSE_INTSHDQ_P_RI>;
+                       [(set VR128:$dst,
+                         (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
+                       IIC_SSE_INTSHDQ_P_RI>;
   // PSRADQri doesn't exist in SSE[1-3].
 }
 } // Constraints = "$src1 = $dst"
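With the [(set ...)] patterns now written directly on the instruction definitions above, the standalone X86vshldq/X86vshrdq Pat<> entries, which existed only to run the old bit count through BYTE_imm, are dead; the next three hunks delete them: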
@@ -4279,12 +4291,6 @@ let Predicates = [HasAVX] in {
             (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
             (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-
-  // Shift up / down and insert zero's.
-  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
-            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
-  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
-            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
 }

 let Predicates = [HasAVX2] in {
@@ -4292,12 +4298,6 @@ let Predicates = [HasAVX2] in {
             (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
-
-  // Shift up / down and insert zero's.
-  def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
-            (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
-  def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
-            (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
 }

 let Predicates = [UseSSE2] in {
@@ -4307,12 +4307,6 @@ let Predicates = [UseSSE2] in {
             (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
             (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-
-  // Shift up / down and insert zero's.
-  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
-            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
-  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
-            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
 }

 //===---------------------------------------------------------------------===//


@@ -31,3 +31,18 @@ define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
 }
 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+
+define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
+
+define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone


@@ -168,14 +168,6 @@ define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
 declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
-
-define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
-
 define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
   ; CHECK: vpsllq
   %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
@@ -264,14 +256,6 @@ define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
 declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
-
-define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
-
 define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
   ; CHECK: vpsrlq
   %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]