mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-31 09:32:11 +00:00
Enable SSE4 codegen and pattern matching.
Add some notes to the README. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@46949 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
a6ed0aa8ec
commit
14d12caf1d
@ -181,8 +181,8 @@ def SDTVecShuffle : SDTypeProfile<1, 3, [
|
||||
def SDTVecExtract : SDTypeProfile<1, 2, [ // vector extract
|
||||
SDTCisEltOfVec<0, 1>, SDTCisPtrTy<2>
|
||||
]>;
|
||||
def SDTVecInsert : SDTypeProfile<1, 2, [ // vector insert
|
||||
SDTCisEltOfVec<1, 0>, SDTCisPtrTy<2>
|
||||
def SDTVecInsert : SDTypeProfile<1, 3, [ // vector insert
|
||||
SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3>
|
||||
]>;
|
||||
|
||||
class SDCallSeqStart<list<SDTypeConstraint> constraints> :
|
||||
|
@ -761,3 +761,23 @@ an X86 fxor. This means that we need to handle this case in the x86 backend
|
||||
instead of in target independent code.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Non-SSE4 insert into 16 x i8 is atrociously bad.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
|
||||
is memory.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
|
||||
sitting between the truncate and the extract.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
|
||||
any number of 0.0 simultaneously. Currently we only use it for simple
|
||||
insertions.
|
||||
|
||||
See comments in LowerINSERT_VECTOR_ELT_SSE4.
|
||||
|
@ -678,6 +678,33 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
|
||||
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
|
||||
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
|
||||
}
|
||||
|
||||
if (Subtarget->hasSSE41()) {
|
||||
// FIXME: Do we need to handle scalar-to-vector here?
|
||||
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
|
||||
|
||||
// i8 and i16 vectors are custom, because the source register and
|
||||
// source memory operand types are not the same width. f32 vectors are
|
||||
// custom since the immediate controlling the insert encodes additional
|
||||
// information.
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
|
||||
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
|
||||
|
||||
if (Subtarget->is64Bit()) {
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Legal);
|
||||
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
|
||||
}
|
||||
}
|
||||
|
||||
// We want to custom lower some of our intrinsics.
|
||||
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
|
||||
@ -3654,11 +3681,35 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
|
||||
return SDOperand();
|
||||
}
|
||||
|
||||
/// LowerEXTRACT_VECTOR_ELT_SSE4 - Lower an extract of an 8-bit or 16-bit
/// element to X86ISD::PEXTRB / X86ISD::PEXTRW.  Those nodes produce an i32, so
/// the result is wrapped in an AssertZext of the element type and truncated
/// back to it.  An empty SDOperand is returned for any other result width.
SDOperand
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op,
                                                SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  unsigned ResultBits = MVT::getSizeInBits(VT);

  // Only byte and word extracts are handled by this routine.
  if (ResultBits != 8 && ResultBits != 16)
    return SDOperand();

  unsigned Opc = (ResultBits == 8) ? X86ISD::PEXTRB : X86ISD::PEXTRW;

  // Extract into an i32, record that the upper bits are zero, then narrow to
  // the requested element type.
  SDOperand Wide = DAG.getNode(Opc, MVT::i32,
                               Op.getOperand(0), Op.getOperand(1));
  SDOperand Known = DAG.getNode(ISD::AssertZext, MVT::i32, Wide,
                                DAG.getValueType(VT));
  return DAG.getNode(ISD::TRUNCATE, VT, Known);
}
|
||||
|
||||
|
||||
SDOperand
|
||||
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
|
||||
if (!isa<ConstantSDNode>(Op.getOperand(1)))
|
||||
return SDOperand();
|
||||
|
||||
if (Subtarget->hasSSE41())
|
||||
return LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
|
||||
|
||||
MVT::ValueType VT = Op.getValueType();
|
||||
// TODO: handle v16i8.
|
||||
if (MVT::getSizeInBits(VT) == 16) {
|
||||
@ -3699,6 +3750,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
|
||||
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
|
||||
DAG.getIntPtrConstant(0));
|
||||
} else if (MVT::getSizeInBits(VT) == 64) {
|
||||
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
|
||||
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
|
||||
// to match extract_elt for f64.
|
||||
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
|
||||
if (Idx == 0)
|
||||
return Op;
|
||||
@ -3723,10 +3777,48 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
|
||||
return SDOperand();
|
||||
}
|
||||
|
||||
/// LowerINSERT_VECTOR_ELT_SSE4 - Lower an insert into a vector with 8-bit or
/// 16-bit elements to X86ISD::PINSRB / X86ISD::PINSRW, and an insert of an f32
/// element to X86ISD::INSERTPS.
///
/// Operand 0 is the vector, operand 1 the scalar to insert and operand 2 the
/// element index.  An empty SDOperand is returned when the index is not a
/// compile-time constant or when the element type is not handled here.
SDOperand
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG){
  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType EVT = MVT::getVectorElementType(VT);

  SDOperand N0 = Op.getOperand(0);
  SDOperand N1 = Op.getOperand(1);
  SDOperand N2 = Op.getOperand(2);

  // Every node built below encodes the index as an immediate, so a variable
  // index cannot be handled here.  Bail out the same way
  // LowerEXTRACT_VECTOR_ELT does instead of asserting in cast<> below.
  if (!isa<ConstantSDNode>(N2))
    return SDOperand();

  if ((MVT::getSizeInBits(EVT) == 8) || (MVT::getSizeInBits(EVT) == 16)) {
    unsigned Opc = (MVT::getSizeInBits(EVT) == 8) ? X86ISD::PINSRB
                                                  : X86ISD::PINSRW;
    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    // second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue());
    return DAG.getNode(Opc, VT, N0, N1, N2);
  } else if (EVT == MVT::f32) {
    // Bits [7:6] of the constant are the source select.  This will always be
    //  zero here.  The DAG Combiner may combine an extract_elt index into these
    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select.  This is the
    //  value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
    //   combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue() << 4);
    return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2);
  }
  return SDOperand();
}
|
||||
|
||||
SDOperand
|
||||
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
|
||||
MVT::ValueType VT = Op.getValueType();
|
||||
MVT::ValueType EVT = MVT::getVectorElementType(VT);
|
||||
|
||||
if (Subtarget->hasSSE41())
|
||||
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
|
||||
|
||||
if (EVT == MVT::i8)
|
||||
return SDOperand();
|
||||
|
||||
@ -5273,7 +5365,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
|
||||
case X86ISD::Wrapper: return "X86ISD::Wrapper";
|
||||
case X86ISD::S2VEC: return "X86ISD::S2VEC";
|
||||
case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
|
||||
case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
|
||||
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
|
||||
case X86ISD::PINSRB: return "X86ISD::PINSRB";
|
||||
case X86ISD::PINSRW: return "X86ISD::PINSRW";
|
||||
case X86ISD::FMAX: return "X86ISD::FMAX";
|
||||
case X86ISD::FMIN: return "X86ISD::FMIN";
|
||||
|
@ -170,10 +170,22 @@ namespace llvm {
|
||||
/// have to match the operand type.
|
||||
S2VEC,
|
||||
|
||||
/// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
|
||||
/// i32, corresponds to X86::PEXTRB.
|
||||
PEXTRB,
|
||||
|
||||
/// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
|
||||
/// i32, corresponds to X86::PEXTRW.
|
||||
PEXTRW,
|
||||
|
||||
/// INSERTPS - Insert any element of a 4 x float vector into any element
|
||||
/// of a destination 4 x float vector.
|
||||
INSERTPS,
|
||||
|
||||
/// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
|
||||
/// corresponds to X86::PINSRB.
|
||||
PINSRB,
|
||||
|
||||
/// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
|
||||
/// corresponds to X86::PINSRW.
|
||||
PINSRW,
|
||||
@ -493,7 +505,9 @@ namespace llvm {
|
||||
SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG);
|
||||
SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG);
|
||||
|
@ -35,8 +35,19 @@ def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
|
||||
def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
|
||||
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
|
||||
def X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
|
||||
def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
|
||||
def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
|
||||
def X86pextrb : SDNode<"X86ISD::PEXTRB",
|
||||
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
|
||||
def X86pextrw : SDNode<"X86ISD::PEXTRW",
|
||||
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
|
||||
def X86pinsrb : SDNode<"X86ISD::PINSRB",
|
||||
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
|
||||
SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
|
||||
def X86pinsrw : SDNode<"X86ISD::PINSRW",
|
||||
SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
|
||||
SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
|
||||
def X86insrtps : SDNode<"X86ISD::INSERTPS",
|
||||
SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
|
||||
SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SSE 'Special' Instructions
|
||||
@ -2087,23 +2098,21 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
|
||||
(outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
|
||||
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
|
||||
[(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
|
||||
(iPTR imm:$src2)))]>;
|
||||
imm:$src2))]>;
|
||||
let isTwoAddress = 1 in {
|
||||
def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
|
||||
(outs VR128:$dst), (ins VR128:$src1,
|
||||
GR32:$src2, i32i8imm:$src3),
|
||||
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
|
||||
[(set VR128:$dst,
|
||||
(v8i16 (X86pinsrw (v8i16 VR128:$src1),
|
||||
GR32:$src2, (iPTR imm:$src3))))]>;
|
||||
(X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
|
||||
def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
|
||||
(outs VR128:$dst), (ins VR128:$src1,
|
||||
i16mem:$src2, i32i8imm:$src3),
|
||||
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
|
||||
[(set VR128:$dst,
|
||||
(v8i16 (X86pinsrw (v8i16 VR128:$src1),
|
||||
(i32 (anyext (loadi16 addr:$src2))),
|
||||
(iPTR imm:$src3))))]>;
|
||||
[(set VR128:$dst,
|
||||
(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
|
||||
imm:$src3))]>;
|
||||
}
|
||||
|
||||
// Mask creation
|
||||
@ -3255,7 +3264,7 @@ defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw",
|
||||
|
||||
|
||||
/// SS41I_ternary_int - SSE 4.1 ternary operator
|
||||
let isTwoAddress = 1 in {
|
||||
let Uses = [XMM0], isTwoAddress = 1 in {
|
||||
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
|
||||
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2),
|
||||
@ -3328,26 +3337,44 @@ defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
|
||||
// Zero-extending form: the mnemonic must match the pmovzxbq intrinsic.
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
|
||||
|
||||
|
||||
/// SS41I_binop_ext8 - SSE 4.1 binary operator with immediate
|
||||
multiclass SS41I_binop_ext8<bits<8> opc, string OpcodeStr> {
|
||||
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
|
||||
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
|
||||
def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst),
|
||||
(ins VR128:$src1, i32i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[(set GR32:$dst, (zext
|
||||
(extractelt (v16i8 VR128:$src1), imm:$src2)))]>, OpSize;
|
||||
[(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
|
||||
OpSize;
|
||||
def mr : SS4AI<opc, MRMDestMem, (outs),
|
||||
(ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[(store (extractelt (v16i8 VR128:$src1), imm:$src2),
|
||||
addr:$dst)]>, OpSize;
|
||||
[]>, OpSize;
|
||||
// FIXME:
|
||||
// There's an AssertZext in the way of writing the store pattern
|
||||
// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
|
||||
}
|
||||
|
||||
defm PEXTRB : SS41I_binop_ext8<0x14, "pextrb">;
|
||||
defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
|
||||
|
||||
/// SS41I_binop_ext32 - SSE 4.1 binary operator with immediate
|
||||
multiclass SS41I_binop_ext32<bits<8> opc, string OpcodeStr> {
|
||||
|
||||
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
|
||||
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
|
||||
def mr : SS4AI<opc, MRMDestMem, (outs),
|
||||
(ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[]>, OpSize;
|
||||
// FIXME:
|
||||
// There's an AssertZext in the way of writing the store pattern
|
||||
// (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst)
|
||||
}
|
||||
|
||||
defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
|
||||
|
||||
|
||||
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
|
||||
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
|
||||
def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst),
|
||||
(ins VR128:$src1, i32i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -3362,10 +3389,11 @@ multiclass SS41I_binop_ext32<bits<8> opc, string OpcodeStr> {
|
||||
addr:$dst)]>, OpSize;
|
||||
}
|
||||
|
||||
defm PEXTRD : SS41I_binop_ext32<0x16, "pextrd">;
|
||||
defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
|
||||
|
||||
/// SS41I_binop_extf32 - SSE 4.1 binary operator with immediate
|
||||
multiclass SS41I_binop_extf32<bits<8> opc, string OpcodeStr> {
|
||||
|
||||
/// SS41I_extractf32 - SSE 4.1 extract 32 bits to fp reg or memory destination
|
||||
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
|
||||
def rr : SS4AI<opc, MRMSrcReg, (outs FR32:$dst),
|
||||
(ins VR128:$src1, i32i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
@ -3380,5 +3408,65 @@ multiclass SS41I_binop_extf32<bits<8> opc, string OpcodeStr> {
|
||||
addr:$dst)]>, OpSize;
|
||||
}
|
||||
|
||||
defm EXTRACTPS : SS41I_binop_extf32<0x17, "extractps">;
|
||||
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
|
||||
|
||||
/// SS41I_insert8 - SSE 4.1 insert of 8 bits into a vector, taken from either
/// a 32 bit reg or an 8 bit mem operand.  Both forms match the X86pinsrb node
/// and take the element index as the immediate $src3.
let isTwoAddress = 1 in {
|
||||
multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
|
||||
// Register form: the value to insert is in GR32:$src2.
def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst,
|
||||
(X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
|
||||
// Memory form: the value to insert is an extending 8 bit load from $src2.
def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst),
|
||||
(ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst,
|
||||
(X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
|
||||
imm:$src3))]>, OpSize;
|
||||
}
|
||||
}
|
||||
|
||||
defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
|
||||
|
||||
/// SS41I_insert32 - SSE 4.1 insert of 32 bits into a v4i32 vector, taken from
/// either a 32 bit reg or a 32 bit mem operand.  Both forms match the generic
/// insertelt node directly.
let isTwoAddress = 1 in {
|
||||
multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
|
||||
// Register form: the value to insert is in GR32:$src2.
def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst,
|
||||
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
|
||||
OpSize;
|
||||
// Memory form: the value to insert is a 32 bit load from $src2.
def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst),
|
||||
(ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst,
|
||||
(v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
|
||||
imm:$src3)))]>, OpSize;
|
||||
}
|
||||
}
|
||||
|
||||
defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
|
||||
|
||||
/// SS41I_insertf32 - SSE 4.1 insert of a 32 bit fp value into a vector, taken
/// from either an FR32 reg or a 32 bit fp mem operand.  Both forms match the
/// X86insrtps node, whose immediate operand is used as $src3 unchanged.
let isTwoAddress = 1 in {
|
||||
multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
|
||||
// Register form: the value to insert is in FR32:$src2.
def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, FR32:$src2, i32i8imm:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst,
|
||||
(X86insrtps VR128:$src1, FR32:$src2, imm:$src3))]>, OpSize;
|
||||
// Memory form: the value to insert is an f32 load from $src2.
def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst),
|
||||
(ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst,
|
||||
(X86insrtps VR128:$src1, (loadf32 addr:$src2),
|
||||
imm:$src3))]>, OpSize;
|
||||
}
|
||||
}
|
||||
|
||||
defm INSERTPS : SS41I_insertf32<0x31, "insertps">;
|
||||
|
Loading…
Reference in New Issue
Block a user