From 46154eb6fd7d0dc908eda5dd52fe16d893e8e008 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 11 Nov 2011 07:39:23 +0000
Subject: [PATCH] Add lowering for AVX2 shift instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144380 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 185 +++++++++++++++++--------
 lib/Target/X86/X86InstrSSE.td      |  99 +++++---------
 test/CodeGen/X86/avx2-logic.ll     |  73 ----------
 test/CodeGen/X86/avx2-shift.ll     | 210 +++++++++++++++++++++++++++++
 4 files changed, 377 insertions(+), 190 deletions(-)
 create mode 100644 test/CodeGen/X86/avx2-shift.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 93f7de89de2..e77b1dfcd58 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1050,21 +1050,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::MUL,     MVT::v4i64, Custom);
     setOperationAction(ISD::MUL,     MVT::v8i32, Legal);
     setOperationAction(ISD::MUL,     MVT::v16i16, Legal);
+    // Don't lower v32i8 because there is no 128-bit byte mul
 
     setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
-    setOperationAction(ISD::SHL,     MVT::v4i32, Legal);
-    setOperationAction(ISD::SHL,     MVT::v2i64, Legal);
-    setOperationAction(ISD::SRL,     MVT::v4i32, Legal);
-    setOperationAction(ISD::SRL,     MVT::v2i64, Legal);
-    setOperationAction(ISD::SRA,     MVT::v4i32, Legal);
-
-    setOperationAction(ISD::SHL,     MVT::v8i32, Legal);
-    setOperationAction(ISD::SHL,     MVT::v4i64, Legal);
-    setOperationAction(ISD::SRL,     MVT::v8i32, Legal);
-    setOperationAction(ISD::SRL,     MVT::v4i64, Legal);
-    setOperationAction(ISD::SRA,     MVT::v8i32, Legal);
-
-    // Don't lower v32i8 because there is no 128-bit byte mul
   } else {
     setOperationAction(ISD::ADD,     MVT::v4i64, Custom);
     setOperationAction(ISD::ADD,     MVT::v8i32, Custom);
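The SHL/SRL/SRA Legal markings go away because, after this patch, LowerShift() builds the AVX2 shift intrinsic nodes directly; a generic IR shift therefore reaches the same INTRINSIC_WO_CHAIN node as an explicit intrinsic call. As a rough illustration (this IR is not part of the patch, and assumes the usual AVX2 intrinsic naming):

    declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>)

    ; Both %a and %b should select vpsllvd once shifts are custom-lowered.
    define <8 x i32> @shl_vs_intrinsic(<8 x i32> %x, <8 x i32> %y) {
      %a = shl <8 x i32> %x, %y
      %b = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %x, <8 x i32> %y)
      %r = add <8 x i32> %a, %b
      ret <8 x i32> %r
    }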
@@ -10130,47 +10118,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   if (!Subtarget->hasXMMInt())
     return SDValue();
 
-  // Decompose 256-bit shifts into smaller 128-bit shifts.
-  if (VT.getSizeInBits() == 256) {
-    int NumElems = VT.getVectorNumElements();
-    MVT EltVT = VT.getVectorElementType().getSimpleVT();
-    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
-    // Extract the two vectors
-    SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
-    SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
-                                     DAG, dl);
-
-    // Recreate the shift amount vectors
-    SDValue Amt1, Amt2;
-    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
-      // Constant shift amount
-      SmallVector<SDValue, 4> Amt1Csts;
-      SmallVector<SDValue, 4> Amt2Csts;
-      for (int i = 0; i < NumElems/2; ++i)
-        Amt1Csts.push_back(Amt->getOperand(i));
-      for (int i = NumElems/2; i < NumElems; ++i)
-        Amt2Csts.push_back(Amt->getOperand(i));
-
-      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                         &Amt1Csts[0], NumElems/2);
-      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                         &Amt2Csts[0], NumElems/2);
-    } else {
-      // Variable shift amount
-      Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
-      Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
-                                 DAG, dl);
-    }
-
-    // Issue new vector shifts for the smaller types
-    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
-    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
-
-    // Concatenate the result back
-    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
-  }
-
   // Optimize shl/srl/sra with constant shift amount.
   if (isSplatVector(Amt.getNode())) {
     SDValue SclrAmt = Amt->getOperand(0);
@@ -10259,9 +10206,97 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
       return Res;
     }
+
+    if (Subtarget->hasAVX2()) {
+      if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SHL)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SHL)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SHL)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SRL)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRL)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRL)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRA)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRA)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
+                   R, DAG.getConstant(ShiftAmt, MVT::i32));
+    }
     }
   }
+
+  // AVX2 variable shifts
+  if (Subtarget->hasAVX2()) {
+    if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psllv_d, MVT::i32),
+                 R, Amt);
+    if (VT == MVT::v8i32 && Op->getOpcode() == ISD::SHL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psllv_d_256, MVT::i32),
+                 R, Amt);
+    if (VT == MVT::v2i64 && Op->getOpcode() == ISD::SHL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psllv_q, MVT::i32),
+                 R, Amt);
+    if (VT == MVT::v4i64 && Op->getOpcode() == ISD::SHL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psllv_q_256, MVT::i32),
+                 R, Amt);
+
+    if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SRL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psrlv_d, MVT::i32),
+                 R, Amt);
+    if (VT == MVT::v8i32 && Op->getOpcode() == ISD::SRL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psrlv_d_256, MVT::i32),
+                 R, Amt);
+    if (VT == MVT::v2i64 && Op->getOpcode() == ISD::SRL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psrlv_q, MVT::i32),
+                 R, Amt);
+    if (VT == MVT::v4i64 && Op->getOpcode() == ISD::SRL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psrlv_q_256, MVT::i32),
+                 R, Amt);
+
+    if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SRA)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psrav_d, MVT::i32),
+                 R, Amt);
+    if (VT == MVT::v8i32 && Op->getOpcode() == ISD::SRA)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                 DAG.getConstant(Intrinsic::x86_avx2_psrav_d_256, MVT::i32),
+                 R, Amt);
+  }
+
   // Lower SHL with variable shift amount.
   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
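For splat-constant amounts, the new hasAVX2() block in the isSplatVector() path keeps a 256-bit shift as a single immediate-form instruction (vpslld/vpsllw/vpsllq and the psrli/psrai counterparts) instead of letting it fall through to the 128-bit decomposition. Illustrative IR in the style of the vshift* tests added below (not itself part of the patch):

    ; Splat constant amount: should select vpsrad with an immediate.
    define <8 x i32> @sra_splat_imm(<8 x i32> %a) nounwind readnone {
      %s = ashr <8 x i32> %a, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
      ret <8 x i32> %s
    }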
@@ -10328,6 +10363,48 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
       R, DAG.getNode(ISD::ADD, dl, VT, R, R));
     return R;
   }
+
+  // Decompose 256-bit shifts into smaller 128-bit shifts.
+  if (VT.getSizeInBits() == 256) {
+    int NumElems = VT.getVectorNumElements();
+    MVT EltVT = VT.getVectorElementType().getSimpleVT();
+    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+    // Extract the two vectors
+    SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+                                     DAG, dl);
+
+    // Recreate the shift amount vectors
+    SDValue Amt1, Amt2;
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      // Constant shift amount
+      SmallVector<SDValue, 4> Amt1Csts;
+      SmallVector<SDValue, 4> Amt2Csts;
+      for (int i = 0; i < NumElems/2; ++i)
+        Amt1Csts.push_back(Amt->getOperand(i));
+      for (int i = NumElems/2; i < NumElems; ++i)
+        Amt2Csts.push_back(Amt->getOperand(i));
+
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                         &Amt1Csts[0], NumElems/2);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                         &Amt2Csts[0], NumElems/2);
+    } else {
+      // Variable shift amount
+      Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+      Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+                                 DAG, dl);
+    }
+
+    // Issue new vector shifts for the smaller types
+    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+    // Concatenate the result back
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+  }
+
   return SDValue();
 }
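Moving the decomposition to the end of LowerShift() is the key ordering change: it now acts only as a fallback for 256-bit shifts that none of the AVX2 cases above handled. One such case (illustrative, not among this patch's tests) is a variable v16i16 shift, for which AVX2 has no per-element instruction:

    ; No AVX2 per-element 16-bit variable shift exists, so this should be
    ; decomposed into two <8 x i16> halves and expanded from there.
    define <16 x i16> @var_shl_v16i16(<16 x i16> %x, <16 x i16> %y) {
      %s = shl <16 x i16> %x, %y
      ret <16 x i16> %s
    }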
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 91c84dd6a1a..10f527cdeda 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7655,7 +7655,6 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
 
 // Variable Bit Shifts
 //
 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr,
-                          PatFrag pf128, PatFrag pf256,
                           Intrinsic Int128, Intrinsic Int256> {
   def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2))]>, VEX_4V;
   def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, i128mem:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set VR128:$dst, (Int128 VR128:$src1, (pf128 addr:$src2)))]>,
+             [(set VR128:$dst,
+                   (Int128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
              VEX_4V;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
              (ins VR256:$src1, VR256:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2))]>, VEX_4V;
   def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
              (ins VR256:$src1, i256mem:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set VR256:$dst, (Int256 VR256:$src1, (pf256 addr:$src2)))]>,
+             [(set VR256:$dst,
+                   (Int256 VR256:$src1, (bitconvert (memopv4i64 addr:$src2))))]>,
              VEX_4V;
 }
 
-defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", memopv4i32, memopv8i32,
-                              int_x86_avx2_psllv_d, int_x86_avx2_psllv_d_256>;
-defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", memopv2i64, memopv4i64,
-                              int_x86_avx2_psllv_q, int_x86_avx2_psllv_q_256>,
-                              VEX_W;
-defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", memopv4i32, memopv8i32,
-                              int_x86_avx2_psrlv_d, int_x86_avx2_psrlv_d_256>;
-defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", memopv2i64, memopv4i64,
-                              int_x86_avx2_psrlv_q, int_x86_avx2_psrlv_q_256>,
-                              VEX_W;
-defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", memopv4i32, memopv8i32,
-                              int_x86_avx2_psrav_d, int_x86_avx2_psrav_d_256>;
-
-
-let Predicates = [HasAVX2] in {
-
-  def : Pat<(v4i32 (shl (v4i32 VR128:$src1), (v4i32 VR128:$src2))),
-            (VPSLLVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v2i64 (shl (v2i64 VR128:$src1), (v2i64 VR128:$src2))),
-            (VPSLLVQrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v4i32 (srl (v4i32 VR128:$src1), (v4i32 VR128:$src2))),
-            (VPSRLVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v2i64 (srl (v2i64 VR128:$src1), (v2i64 VR128:$src2))),
-            (VPSRLVQrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v4i32 (sra (v4i32 VR128:$src1), (v4i32 VR128:$src2))),
-            (VPSRAVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v8i32 (shl (v8i32 VR256:$src1), (v8i32 VR256:$src2))),
-            (VPSLLVDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v4i64 (shl (v4i64 VR256:$src1), (v4i64 VR256:$src2))),
-            (VPSLLVQYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (srl (v8i32 VR256:$src1), (v8i32 VR256:$src2))),
-            (VPSRLVDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v4i64 (srl (v4i64 VR256:$src1), (v4i64 VR256:$src2))),
-            (VPSRLVQYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (sra (v8i32 VR256:$src1), (v8i32 VR256:$src2))),
-            (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
-
-  def : Pat<(v4i32 (shl (v4i32 VR128:$src1),(loadv4i32 addr:$src2))),
-            (VPSLLVDrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v4i32 (shl (v4i32 VR128:$src1),(loadv2i64 addr:$src2))),
-            (VPSLLVDrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v2i64 (shl (v2i64 VR128:$src1),(loadv2i64 addr:$src2))),
-            (VPSLLVQrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v4i32 (srl (v4i32 VR128:$src1),(loadv4i32 addr:$src2))),
-            (VPSRLVDrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v2i64 (srl (v2i64 VR128:$src1),(loadv2i64 addr:$src2))),
-            (VPSRLVQrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v4i32 (sra (v4i32 VR128:$src1),(loadv4i32 addr:$src2))),
-            (VPSRAVDrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (shl (v8i32 VR256:$src1),(loadv8i32 addr:$src2))),
-            (VPSLLVDYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v4i64 (shl (v4i64 VR256:$src1),(loadv4i64 addr:$src2))),
-            (VPSLLVQYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (srl (v8i32 VR256:$src1),(loadv8i32 addr:$src2))),
-            (VPSRLVDYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v4i64 (srl (v4i64 VR256:$src1),(loadv4i64 addr:$src2))),
-            (VPSRLVQYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (sra (v8i32 VR256:$src1),(loadv8i32 addr:$src2))),
-            (VPSRAVDYrm VR256:$src1, addr:$src2)>;
-}
+multiclass avx2_var_shift_i64<bits<8> opc, string OpcodeStr,
+                              Intrinsic Int128, Intrinsic Int256> {
+  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2))]>, VEX_4V;
+  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (Int128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
+             VEX_4V;
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2))]>, VEX_4V;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, i256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (Int256 VR256:$src1, (memopv4i64 addr:$src2)))]>,
+             VEX_4V;
+}
-
+defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", int_x86_avx2_psllv_d,
+                              int_x86_avx2_psllv_d_256>;
+defm VPSLLVQ : avx2_var_shift_i64<0x47, "vpsllvq", int_x86_avx2_psllv_q,
+                                  int_x86_avx2_psllv_q_256>, VEX_W;
+defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", int_x86_avx2_psrlv_d,
+                              int_x86_avx2_psrlv_d_256>;
+defm VPSRLVQ : avx2_var_shift_i64<0x45, "vpsrlvq", int_x86_avx2_psrlv_q,
+                                  int_x86_avx2_psrlv_q_256>, VEX_W;
+defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", int_x86_avx2_psrav_d,
+                              int_x86_avx2_psrav_d_256>;
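The rm/Yrm patterns fold a load of the shift-amount vector into the instruction's memory operand; the bitconvert-of-memopv2i64 form is there, presumably, so that a v4i32 amount still matches a load the DAG has canonicalized to 64-bit elements. The avx2-shift.ll tests below cover folding through generic shifts; a direct intrinsic call (illustrative, assuming the usual intrinsic naming) should fold the same way:

    declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>)

    ; Expected to select the memory form, e.g. vpsravd (%rdi), %xmm0, %xmm0.
    define <4 x i32> @psrav_amount_from_mem(<4 x i32> %x, <4 x i32>* %p) {
      %amt = load <4 x i32>* %p
      %r = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %x, <4 x i32> %amt)
      ret <4 x i32> %r
    }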
diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll
index 7df1a306e62..f1c294c0666 100644
--- a/test/CodeGen/X86/avx2-logic.ll
+++ b/test/CodeGen/X86/avx2-logic.ll
@@ -53,76 +53,3 @@ define <32 x i8> @vpblendvb(<32 x i8> %x, <32 x i8> %y) {
   %min = select <32 x i1> %min_is_x, <32 x i8> %x, <32 x i8> %y
   ret <32 x i8> %min
 }
-
-
-; CHECK: variable_shl0
-; CHECK: psllvd
-; CHECK: ret
-define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) {
-  %k = shl <4 x i32> %x, %y
-  ret <4 x i32> %k
-}
-; CHECK: variable_shl1
-; CHECK: psllvd
-; CHECK: ret
-define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) {
-  %k = shl <8 x i32> %x, %y
-  ret <8 x i32> %k
-}
-; CHECK: variable_shl2
-; CHECK: psllvq
-; CHECK: ret
-define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) {
-  %k = shl <2 x i64> %x, %y
-  ret <2 x i64> %k
-}
-; CHECK: variable_shl3
-; CHECK: psllvq
-; CHECK: ret
-define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) {
-  %k = shl <4 x i64> %x, %y
-  ret <4 x i64> %k
-}
-; CHECK: variable_srl0
-; CHECK: psrlvd
-; CHECK: ret
-define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) {
-  %k = lshr <4 x i32> %x, %y
-  ret <4 x i32> %k
-}
-; CHECK: variable_srl1
-; CHECK: psrlvd
-; CHECK: ret
-define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) {
-  %k = lshr <8 x i32> %x, %y
-  ret <8 x i32> %k
-}
-; CHECK: variable_srl2
-; CHECK: psrlvq
-; CHECK: ret
-define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) {
-  %k = lshr <2 x i64> %x, %y
-  ret <2 x i64> %k
-}
-; CHECK: variable_srl3
-; CHECK: psrlvq
-; CHECK: ret
-define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
-  %k = lshr <4 x i64> %x, %y
-  ret <4 x i64> %k
-}
-
-; CHECK: variable_sra0
-; CHECK: psravd
-; CHECK: ret
-define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
-  %k = ashr <4 x i32> %x, %y
-  ret <4 x i32> %k
-}
-; CHECK: variable_sra1
-; CHECK: psravd
-; CHECK: ret
-define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
-  %k = ashr <8 x i32> %x, %y
-  ret <8 x i32> %k
-}
diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll
new file mode 100644
index 00000000000..f7593616a81
--- /dev/null
+++ b/test/CodeGen/X86/avx2-shift.ll
@@ -0,0 +1,210 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; CHECK: variable_shl0
+; CHECK: psllvd
+; CHECK: ret
+define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) {
+  %k = shl <4 x i32> %x, %y
+  ret <4 x i32> %k
+}
+; CHECK: variable_shl1
+; CHECK: psllvd
+; CHECK: ret
+define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) {
+  %k = shl <8 x i32> %x, %y
+  ret <8 x i32> %k
+}
+; CHECK: variable_shl2
+; CHECK: psllvq
+; CHECK: ret
+define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) {
+  %k = shl <2 x i64> %x, %y
+  ret <2 x i64> %k
+}
+; CHECK: variable_shl3
+; CHECK: psllvq
+; CHECK: ret
+define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) {
+  %k = shl <4 x i64> %x, %y
+  ret <4 x i64> %k
+}
+; CHECK: variable_srl0
+; CHECK: psrlvd
+; CHECK: ret
+define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) {
+  %k = lshr <4 x i32> %x, %y
+  ret <4 x i32> %k
+}
+; CHECK: variable_srl1
+; CHECK: psrlvd
+; CHECK: ret
+define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) {
+  %k = lshr <8 x i32> %x, %y
+  ret <8 x i32> %k
+}
+; CHECK: variable_srl2
+; CHECK: psrlvq
+; CHECK: ret
+define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) {
+  %k = lshr <2 x i64> %x, %y
+  ret <2 x i64> %k
+}
+; CHECK: variable_srl3
+; CHECK: psrlvq
+; CHECK: ret
+define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
+  %k = lshr <4 x i64> %x, %y
+  ret <4 x i64> %k
+}
+
+; CHECK: variable_sra0
+; CHECK: psravd
+; CHECK: ret
+define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
+  %k = ashr <4 x i32> %x, %y
+  ret <4 x i32> %k
+}
+; CHECK: variable_sra1
+; CHECK: psravd
+; CHECK: ret
+define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
+  %k = ashr <8 x i32> %x, %y
+  ret <8 x i32> %k
+}
+
+;;; Shift left
+; CHECK: vpslld
+define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
+  %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i32> %s
+}
+
+; CHECK: vpsllw
+define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
+  %s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <16 x i16> %s
+}
+
+; CHECK: vpsllq
+define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
+  %s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
+  ret <4 x i64> %s
+}
+
+;;; Logical Shift right
+; CHECK: vpsrld
+define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
+  %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i32> %s
+}
+
+; CHECK: vpsrlw
+define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
+  %s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <16 x i16> %s
+}
+
+; CHECK: vpsrlq
+define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
+  %s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
+  ret <4 x i64> %s
+}
+
+;;; Arithmetic Shift right
+; CHECK: vpsrad
+define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
+  %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i32> %s
+}
+
+; CHECK: vpsraw
+define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
+  %s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <16 x i16> %s
+}
+
+; CHECK: variable_sra0_load
+; CHECK: psravd (%
+; CHECK: ret
+define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
+  %y1 = load <4 x i32>* %y
+  %k = ashr <4 x i32> %x, %y1
+  ret <4 x i32> %k
+}
+
+; CHECK: variable_sra1_load
+; CHECK: psravd (%
+; CHECK: ret
+define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
+  %y1 = load <8 x i32>* %y
+  %k = ashr <8 x i32> %x, %y1
+  ret <8 x i32> %k
+}
+
+; CHECK: variable_shl0_load
+; CHECK: psllvd (%
+; CHECK: ret
+define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
+  %y1 = load <4 x i32>* %y
+  %k = shl <4 x i32> %x, %y1
+  ret <4 x i32> %k
+}
+; CHECK: variable_shl1_load
+; CHECK: psllvd (%
+; CHECK: ret
+define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
+  %y1 = load <8 x i32>* %y
+  %k = shl <8 x i32> %x, %y1
+  ret <8 x i32> %k
+}
+; CHECK: variable_shl2_load
+; CHECK: psllvq (%
+; CHECK: ret
+define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
+  %y1 = load <2 x i64>* %y
+  %k = shl <2 x i64> %x, %y1
+  ret <2 x i64> %k
+}
+; CHECK: variable_shl3_load
+; CHECK: psllvq (%
+; CHECK: ret
+define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
+  %y1 = load <4 x i64>* %y
+  %k = shl <4 x i64> %x, %y1
+  ret <4 x i64> %k
+}
+; CHECK: variable_srl0_load
+; CHECK: psrlvd (%
+; CHECK: ret
+define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
+  %y1 = load <4 x i32>* %y
+  %k = lshr <4 x i32> %x, %y1
+  ret <4 x i32> %k
+}
+; CHECK: variable_srl1_load
+; CHECK: psrlvd (%
+; CHECK: ret
+define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
+  %y1 = load <8 x i32>* %y
+  %k = lshr <8 x i32> %x, %y1
+  ret <8 x i32> %k
+}
+; CHECK: variable_srl2_load
+; CHECK: psrlvq (%
+; CHECK: ret
+define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
+  %y1 = load <2 x i64>* %y
+  %k = lshr <2 x i64> %x, %y1
+  ret <2 x i64> %k
+}
+; CHECK: variable_srl3_load
+; CHECK: psrlvq (%
+; CHECK: ret
+define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) {
+  %y1 = load <4 x i64>* %y
+  %k = lshr <4 x i64> %x, %y1
+  ret <4 x i64> %k
+}
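One family deliberately not covered by the new cases is byte shifts: AVX2 has no per-element v32i8 shift, so a shift like the following (illustrative, not among this patch's tests) should first be split in half by the 256-bit decomposition and then emulated by the existing 128-bit v16i8 lowering, which uses 16-bit shifts plus masking:

    ; Expected to be decomposed into two <16 x i8> halves and emulated there.
    define <32 x i8> @shl_v32i8(<32 x i8> %a) {
      %s = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
      ret <32 x i8> %s
    }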