mirror of
				https://github.com/c64scene-ar/llvm-6502.git
				synced 2025-11-03 14:21:30 +00:00 
			
		
		
		
	[SystemZ] Handle sub-128 vectors
The ABI allows sub-128 vectors to be passed and returned in registers, with the vector occupying the upper part of a register. We therefore want to legalize those types by widening the vector rather than promoting the elements. The patch includes some simple tests for sub-128 vectors and also tests that we can recognize various pack sequences, some of which use sub-128 vectors as temporary results. One of these forms is based on the pack sequences generated by llvmpipe when no intrinsics are used. Signed unpacks are recognized as BUILD_VECTORs whose elements are individually sign-extended. Unsigned unpacks can have the equivalent form with zero extension, but they also occur as shuffles in which some elements are zero. Based on a patch by Richard Sandiford. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236525 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
		@@ -28,6 +28,14 @@ private:
 | 
			
		||||
  /// See ISD::OutputArg::IsFixed.
 | 
			
		||||
  SmallVector<bool, 4> ArgIsFixed;
 | 
			
		||||
 | 
			
		||||
  /// Records whether the value was widened from a short vector type.
 | 
			
		||||
  SmallVector<bool, 4> ArgIsShortVector;
 | 
			
		||||
 | 
			
		||||
  // Check whether ArgVT is a short vector type.
 | 
			
		||||
  bool IsShortVectorType(EVT ArgVT) {
 | 
			
		||||
    return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
public:
 | 
			
		||||
  SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
 | 
			
		||||
                 SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
 | 
			
		||||
@@ -39,6 +47,10 @@ public:
 | 
			
		||||
    ArgIsFixed.clear();
 | 
			
		||||
    for (unsigned i = 0; i < Ins.size(); ++i)
 | 
			
		||||
      ArgIsFixed.push_back(true);
 | 
			
		||||
    // Record whether the call operand was a short vector.
 | 
			
		||||
    ArgIsShortVector.clear();
 | 
			
		||||
    for (unsigned i = 0; i < Ins.size(); ++i)
 | 
			
		||||
      ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
 | 
			
		||||
 | 
			
		||||
    CCState::AnalyzeFormalArguments(Ins, Fn);
 | 
			
		||||
  }
 | 
			
		||||
@@ -49,6 +61,10 @@ public:
 | 
			
		||||
    ArgIsFixed.clear();
 | 
			
		||||
    for (unsigned i = 0; i < Outs.size(); ++i)
 | 
			
		||||
      ArgIsFixed.push_back(Outs[i].IsFixed);
 | 
			
		||||
    // Record whether the call operand was a short vector.
 | 
			
		||||
    ArgIsShortVector.clear();
 | 
			
		||||
    for (unsigned i = 0; i < Outs.size(); ++i)
 | 
			
		||||
      ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
 | 
			
		||||
 | 
			
		||||
    CCState::AnalyzeCallOperands(Outs, Fn);
 | 
			
		||||
  }
 | 
			
		||||
@@ -60,6 +76,7 @@ public:
 | 
			
		||||
                           CCAssignFn Fn) = delete;
 | 
			
		||||
 | 
			
		||||
  /// Look up the flag stored in ArgIsFixed for value number \p ValNo.
  bool IsFixed(unsigned ValNo) {
    const bool Fixed = ArgIsFixed[ValNo];
    return Fixed;
  }
 | 
			
		||||
  /// Look up the flag stored in ArgIsShortVector for value number \p ValNo,
  /// i.e. whether that value's original type was classified as a short vector.
  bool IsShortVector(unsigned ValNo) {
    const bool WasShort = ArgIsShortVector[ValNo];
    return WasShort;
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
} // end namespace llvm
 | 
			
		||||
 
 | 
			
		||||
@@ -21,6 +21,11 @@ class CCIfSubtarget<string F, CCAction A>
 | 
			
		||||
// Apply action A only when SystemZCCState::IsFixed(ValNo) holds for the
// value currently being assigned.
class CCIfFixed<CCAction A> :
    CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
 | 
			
		||||
 | 
			
		||||
// Apply action A only when SystemZCCState::IsShortVector(ValNo) holds for
// the value currently being assigned, i.e. the argument started out as a
// short vector type before being widened.
class CCIfShortVector<CCAction A> :
    CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//===----------------------------------------------------------------------===//
 | 
			
		||||
// z/Linux return value calling convention
 | 
			
		||||
//===----------------------------------------------------------------------===//
 | 
			
		||||
@@ -43,6 +48,8 @@ def RetCC_SystemZ : CallingConv<[
 | 
			
		||||
  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
 | 
			
		||||
 | 
			
		||||
  // Similarly for vectors, with V24 being the ABI-compliant choice.
 | 
			
		||||
  // Sub-128 vectors are returned in the same way, but they're widened
 | 
			
		||||
  // to one of these types during type legalization.
 | 
			
		||||
  CCIfSubtarget<"hasVector()",
 | 
			
		||||
    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
 | 
			
		||||
             CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
 | 
			
		||||
@@ -74,12 +81,20 @@ def CC_SystemZ : CallingConv<[
 | 
			
		||||
  CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
 | 
			
		||||
  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
 | 
			
		||||
 | 
			
		||||
  // The first 8 named vector arguments are passed in V24-V31.
 | 
			
		||||
  // The first 8 named vector arguments are passed in V24-V31.  Sub-128 vectors
 | 
			
		||||
  // are passed in the same way, but they're widened to one of these types
 | 
			
		||||
  // during type legalization.
 | 
			
		||||
  CCIfSubtarget<"hasVector()",
 | 
			
		||||
    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
 | 
			
		||||
             CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
 | 
			
		||||
                                      V25, V27, V29, V31]>>>>,
 | 
			
		||||
 | 
			
		||||
  // However, sub-128 vectors which need to go on the stack occupy just a
 | 
			
		||||
  // single 8-byte-aligned 8-byte stack slot.  Pass as i64.
 | 
			
		||||
  CCIfSubtarget<"hasVector()",
 | 
			
		||||
    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
 | 
			
		||||
             CCIfShortVector<CCBitConvertToType<i64>>>>,
 | 
			
		||||
 | 
			
		||||
  // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
 | 
			
		||||
  CCIfSubtarget<"hasVector()",
 | 
			
		||||
    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
 | 
			
		||||
 
 | 
			
		||||
@@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
 | 
			
		||||
      // Convert a GPR scalar to a vector by inserting it into element 0.
 | 
			
		||||
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
 | 
			
		||||
 | 
			
		||||
      // Use a series of unpacks for extensions.
 | 
			
		||||
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
 | 
			
		||||
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
 | 
			
		||||
 | 
			
		||||
      // Detect shifts by a scalar amount and convert them into
 | 
			
		||||
      // V*_BY_SCALAR.
 | 
			
		||||
      setOperationAction(ISD::SHL, VT, Custom);
 | 
			
		||||
@@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
 | 
			
		||||
  else if (VA.getLocInfo() == CCValAssign::Indirect)
 | 
			
		||||
    Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
 | 
			
		||||
                        MachinePointerInfo(), false, false, false, 0);
 | 
			
		||||
  else
 | 
			
		||||
  else if (VA.getLocInfo() == CCValAssign::BCvt) {
 | 
			
		||||
    // If this is a short vector argument loaded from the stack,
 | 
			
		||||
    // extend from i64 to full vector size and then bitcast.
 | 
			
		||||
    assert(VA.getLocVT() == MVT::i64);
 | 
			
		||||
    assert(VA.getValVT().isVector());
 | 
			
		||||
    Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64,
 | 
			
		||||
                        Value, DAG.getUNDEF(MVT::i64));
 | 
			
		||||
    Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
 | 
			
		||||
  } else
 | 
			
		||||
    assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
 | 
			
		||||
  return Value;
 | 
			
		||||
}
 | 
			
		||||
@@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
 | 
			
		||||
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
 | 
			
		||||
  case CCValAssign::AExt:
 | 
			
		||||
    return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
 | 
			
		||||
  case CCValAssign::BCvt:
 | 
			
		||||
    // If this is a short vector argument to be stored to the stack,
 | 
			
		||||
    // bitcast to v2i64 and then extract first element.
 | 
			
		||||
    assert(VA.getLocVT() == MVT::i64);
 | 
			
		||||
    assert(VA.getValVT().isVector());
 | 
			
		||||
    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
 | 
			
		||||
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
 | 
			
		||||
                       DAG.getConstant(0, DL, MVT::i32));
 | 
			
		||||
  case CCValAssign::Full:
 | 
			
		||||
    return Value;
 | 
			
		||||
  default:
 | 
			
		||||
@@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
 | 
			
		||||
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Lower an in-register vector extension node as a chain of unpack nodes.
///
/// \param Op          The extension node; operand 0 is the packed input
///                    vector and the node's value type is the wanted result.
/// \param DAG         The DAG in which the replacement nodes are created.
/// \param UnpackHigh  Opcode used for every unpack step.
/// \returns The value produced by the last unpack step.
///
/// Each step doubles the element width, so one node is emitted per doubling
/// between the input and result element sizes.
SDValue
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                              unsigned UnpackHigh) const {
  SDValue PackedOp = Op.getOperand(0);
  EVT OutVT = Op.getValueType();
  EVT InVT = PackedOp.getValueType();
  unsigned ToBits = OutVT.getVectorElementType().getSizeInBits();
  unsigned FromBits = InVT.getVectorElementType().getSizeInBits();
  // NOTE(review): the loop only terminates if ToBits is FromBits times a
  // power of two greater than one -- presumably guaranteed by the callers.
  do {
    FromBits *= 2;
    // Type of this intermediate step: integer elements of the doubled width,
    // with as many lanes as fit in SystemZ::VectorBits.  A distinct name is
    // used so that the final result type (OutVT) is not shadowed.
    EVT StepVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
                                  SystemZ::VectorBits / FromBits);
    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), StepVT, PackedOp);
  } while (FromBits != ToBits);
  return PackedOp;
}
 | 
			
		||||
 | 
			
		||||
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
 | 
			
		||||
                                          unsigned ByScalar) const {
 | 
			
		||||
  // Look for cases where a vector shift can use the *_BY_SCALAR form.
 | 
			
		||||
@@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
 | 
			
		||||
    return lowerINSERT_VECTOR_ELT(Op, DAG);
 | 
			
		||||
  case ISD::EXTRACT_VECTOR_ELT:
 | 
			
		||||
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
 | 
			
		||||
  case ISD::SIGN_EXTEND_VECTOR_INREG:
 | 
			
		||||
    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
 | 
			
		||||
  case ISD::ZERO_EXTEND_VECTOR_INREG:
 | 
			
		||||
    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
 | 
			
		||||
  case ISD::SHL:
 | 
			
		||||
    return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
 | 
			
		||||
  case ISD::SRL:
 | 
			
		||||
@@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
 | 
			
		||||
    OPCODE(PERMUTE_DWORDS);
 | 
			
		||||
    OPCODE(PERMUTE);
 | 
			
		||||
    OPCODE(PACK);
 | 
			
		||||
    OPCODE(UNPACK_HIGH);
 | 
			
		||||
    OPCODE(UNPACKL_HIGH);
 | 
			
		||||
    OPCODE(UNPACK_LOW);
 | 
			
		||||
    OPCODE(UNPACKL_LOW);
 | 
			
		||||
    OPCODE(VSHL_BY_SCALAR);
 | 
			
		||||
    OPCODE(VSRL_BY_SCALAR);
 | 
			
		||||
    OPCODE(VSRA_BY_SCALAR);
 | 
			
		||||
@@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  // (z_merge_high 0, 0) -> 0.  This is mostly useful for using VLLEZF
 | 
			
		||||
  // for v4f32.
 | 
			
		||||
  if (Opcode == SystemZISD::MERGE_HIGH) {
 | 
			
		||||
  if (Opcode == SystemZISD::MERGE_HIGH ||
 | 
			
		||||
      Opcode == SystemZISD::MERGE_LOW) {
 | 
			
		||||
    SDValue Op0 = N->getOperand(0);
 | 
			
		||||
    SDValue Op1 = N->getOperand(1);
 | 
			
		||||
    if (Op0 == Op1) {
 | 
			
		||||
    if (Op0.getOpcode() == ISD::BITCAST)
 | 
			
		||||
      Op0 = Op0.getOperand(0);
 | 
			
		||||
    if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
 | 
			
		||||
          cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
 | 
			
		||||
        cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
 | 
			
		||||
      // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
 | 
			
		||||
      // for v4f32.
 | 
			
		||||
      if (Op1 == N->getOperand(0))
 | 
			
		||||
        return Op1;
 | 
			
		||||
      // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
 | 
			
		||||
      EVT VT = Op1.getValueType();
 | 
			
		||||
      unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
 | 
			
		||||
      if (ElemBytes <= 4) {
 | 
			
		||||
        Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
 | 
			
		||||
                  SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
 | 
			
		||||
        EVT InVT = VT.changeVectorElementTypeToInteger();
 | 
			
		||||
        EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
 | 
			
		||||
                                     SystemZ::VectorBytes / ElemBytes / 2);
 | 
			
		||||
        if (VT != InVT) {
 | 
			
		||||
          Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
 | 
			
		||||
          DCI.AddToWorklist(Op1.getNode());
 | 
			
		||||
        }
 | 
			
		||||
        SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
 | 
			
		||||
        DCI.AddToWorklist(Op.getNode());
 | 
			
		||||
        return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
 | 
			
		||||
 
 | 
			
		||||
@@ -201,6 +201,15 @@ enum {
 | 
			
		||||
  // Pack vector operands 0 and 1 into a single vector with half-sized elements.
 | 
			
		||||
  PACK,
 | 
			
		||||
 | 
			
		||||
  // Unpack the first half of vector operand 0 into double-sized elements.
 | 
			
		||||
  // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
 | 
			
		||||
  UNPACK_HIGH,
 | 
			
		||||
  UNPACKL_HIGH,
 | 
			
		||||
 | 
			
		||||
  // Likewise for the second half.
 | 
			
		||||
  UNPACK_LOW,
 | 
			
		||||
  UNPACKL_LOW,
 | 
			
		||||
 | 
			
		||||
  // Shift each element of vector operand 0 by the number of bits specified
 | 
			
		||||
  // by scalar operand 1.
 | 
			
		||||
  VSHL_BY_SCALAR,
 | 
			
		||||
@@ -306,6 +315,23 @@ public:
 | 
			
		||||
    // want to clobber the upper 32 bits of a GPR unnecessarily.
 | 
			
		||||
    return MVT::i32;
 | 
			
		||||
  }
 | 
			
		||||
  /// Choose how type legalization should treat the vector type \p VT.
  ///
  /// Vectors whose element size is a whole number of bytes are widened to the
  /// full vector width instead of having their integer elements promoted:
  ///
  ///  - sub-128 vectors can then follow the argument/return ABI without
  ///    having to be legal types themselves;
  ///  - there are no extend-on-load or truncate-on-store instructions, which
  ///    makes element promotion the less efficient choice;
  ///  - the widest integer vector type (v2i64) has no multiplication
  ///    instructions.
  ///
  /// Every other vector type falls back to the TargetLoweringBase default.
  TargetLoweringBase::LegalizeTypeAction
  getPreferredVectorAction(EVT VT) const override {
    const bool HasByteSizedElements =
        VT.getVectorElementType().getSizeInBits() % 8 == 0;
    return HasByteSizedElements
               ? TypeWidenVector
               : TargetLoweringBase::getPreferredVectorAction(VT);
  }
 | 
			
		||||
  EVT getSetCCResultType(LLVMContext &, EVT) const override;
 | 
			
		||||
  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 | 
			
		||||
  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 | 
			
		||||
@@ -417,6 +443,8 @@ private:
 | 
			
		||||
  SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
 | 
			
		||||
  SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
 | 
			
		||||
  SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
 | 
			
		||||
  SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
 | 
			
		||||
				 unsigned UnpackHigh) const;
 | 
			
		||||
  SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
 | 
			
		||||
 | 
			
		||||
  SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
 | 
			
		||||
 
 | 
			
		||||
@@ -290,24 +290,24 @@ let Predicates = [FeatureVector] in {
 | 
			
		||||
  def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;
 | 
			
		||||
 | 
			
		||||
  // Unpack high.
 | 
			
		||||
  def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>;
 | 
			
		||||
  def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, null_frag, v128f, v128h, 1>;
 | 
			
		||||
  def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, null_frag, v128g, v128f, 2>;
 | 
			
		||||
  def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>;
 | 
			
		||||
  def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>;
 | 
			
		||||
  def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>;
 | 
			
		||||
 | 
			
		||||
  // Unpack logical high.
 | 
			
		||||
  def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, null_frag, v128h, v128b, 0>;
 | 
			
		||||
  def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, null_frag, v128f, v128h, 1>;
 | 
			
		||||
  def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, null_frag, v128g, v128f, 2>;
 | 
			
		||||
  def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>;
 | 
			
		||||
  def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>;
 | 
			
		||||
  def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>;
 | 
			
		||||
 | 
			
		||||
  // Unpack low.
 | 
			
		||||
  def VUPLB  : UnaryVRRa<"vuplb",  0xE7D6, null_frag, v128h, v128b, 0>;
 | 
			
		||||
  def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, null_frag, v128f, v128h, 1>;
 | 
			
		||||
  def VUPLF  : UnaryVRRa<"vuplf",  0xE7D6, null_frag, v128g, v128f, 2>;
 | 
			
		||||
  def VUPLB  : UnaryVRRa<"vuplb",  0xE7D6, z_unpack_low, v128h, v128b, 0>;
 | 
			
		||||
  def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>;
 | 
			
		||||
  def VUPLF  : UnaryVRRa<"vuplf",  0xE7D6, z_unpack_low, v128g, v128f, 2>;
 | 
			
		||||
 | 
			
		||||
  // Unpack logical low.
 | 
			
		||||
  def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, null_frag, v128h, v128b, 0>;
 | 
			
		||||
  def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, null_frag, v128f, v128h, 1>;
 | 
			
		||||
  def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>;
 | 
			
		||||
  def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>;
 | 
			
		||||
  def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>;
 | 
			
		||||
  def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//===----------------------------------------------------------------------===//
 | 
			
		||||
 
 | 
			
		||||
@@ -193,6 +193,10 @@ def z_permute_dwords    : SDNode<"SystemZISD::PERMUTE_DWORDS",
 | 
			
		||||
                                 SDT_ZVecTernaryInt>;
 | 
			
		||||
def z_permute           : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
 | 
			
		||||
def z_pack              : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
 | 
			
		||||
def z_unpack_high       : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
 | 
			
		||||
def z_unpackl_high      : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
 | 
			
		||||
def z_unpack_low        : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
 | 
			
		||||
def z_unpackl_low       : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>;
 | 
			
		||||
def z_vshl_by_scalar    : SDNode<"SystemZISD::VSHL_BY_SCALAR",
 | 
			
		||||
                                 SDT_ZVecBinaryInt>;
 | 
			
		||||
def z_vsrl_by_scalar    : SDNode<"SystemZISD::VSRL_BY_SCALAR",
 | 
			
		||||
@@ -544,11 +548,12 @@ def z_vllezi64 : PatFrag<(ops node:$addr),
 | 
			
		||||
def z_vllezf32 : PatFrag<(ops node:$addr),
 | 
			
		||||
                         (bitconvert
 | 
			
		||||
                          (z_merge_high
 | 
			
		||||
                           (v2i64 (bitconvert
 | 
			
		||||
                                   (z_merge_high
 | 
			
		||||
                                    (v4f32 (z_vzero)),
 | 
			
		||||
                           (v2i64
 | 
			
		||||
                            (z_unpackl_high
 | 
			
		||||
                             (v4i32
 | 
			
		||||
                              (bitconvert
 | 
			
		||||
                               (v4f32 (scalar_to_vector
 | 
			
		||||
                                            (f32 (load node:$addr))))))),
 | 
			
		||||
                                       (f32 (load node:$addr)))))))),
 | 
			
		||||
                           (v2i64 (z_vzero))))>;
 | 
			
		||||
def z_vllezf64 : PatFrag<(ops node:$addr),
 | 
			
		||||
                         (z_merge_high
 | 
			
		||||
 
 | 
			
		||||
@@ -14,3 +14,17 @@ define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4
 | 
			
		||||
  %y = sub <4 x i32> %v2, %v10
 | 
			
		||||
  ret <4 x i32> %y
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; This routine has 10 vector arguments, which fill up %v24-%v31 and
 | 
			
		||||
; the two single-wide stack slots at 160 and 168.
 | 
			
		||||
define <4 x i8> @bar(<4 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3, <4 x i8> %v4,
 | 
			
		||||
                     <4 x i8> %v5, <4 x i8> %v6, <4 x i8> %v7, <4 x i8> %v8,
 | 
			
		||||
                     <4 x i8> %v9, <4 x i8> %v10) {
 | 
			
		||||
; CHECK-LABEL: bar:
 | 
			
		||||
; CHECK: vlrepg [[REG1:%v[0-9]+]], 168(%r15)
 | 
			
		||||
; CHECK: vsb %v24, %v26, [[REG1]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %y = sub <4 x i8> %v2, %v10
 | 
			
		||||
  ret <4 x i8> %y
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										50
									
								
								test/CodeGen/SystemZ/vec-args-04.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								test/CodeGen/SystemZ/vec-args-04.ll
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,50 @@
 | 
			
		||||
; Test the handling of named short vector arguments.
 | 
			
		||||
;
 | 
			
		||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
 | 
			
		||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
 | 
			
		||||
 | 
			
		||||
; This routine has 12 vector arguments, which fill up %v24-%v31
 | 
			
		||||
; and the four single-wide stack slots starting at 160.
 | 
			
		||||
declare void @bar(<1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>,
 | 
			
		||||
                  <1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>,
 | 
			
		||||
                  <1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>)
 | 
			
		||||
 | 
			
		||||
define void @foo() {
; The first eight arguments are expected in vector registers.
; CHECK-VEC-LABEL: foo:
; CHECK-VEC-DAG: vrepib %v24, 1
; CHECK-VEC-DAG: vrepib %v26, 2
; CHECK-VEC-DAG: vrepib %v28, 3
; CHECK-VEC-DAG: vrepib %v30, 4
; CHECK-VEC-DAG: vrepib %v25, 5
; CHECK-VEC-DAG: vrepib %v27, 6
; CHECK-VEC-DAG: vrepib %v29, 7
; CHECK-VEC-DAG: vrepib %v31, 8
; CHECK-VEC: brasl %r14, bar@PLT
;
; The remaining four arguments are expected in four consecutive 8-byte
; stack slots at offsets 160, 168, 176 and 184 (192-byte frame).
; CHECK-STACK-LABEL: foo:
; CHECK-STACK: aghi %r15, -192
; CHECK-STACK-DAG: llihh [[REG1:%r[0-9]+]], 2304
; CHECK-STACK-DAG: stg [[REG1]], 160(%r15)
; CHECK-STACK-DAG: llihh [[REG2:%r[0-9]+]], 2570
; CHECK-STACK-DAG: stg [[REG2]], 168(%r15)
; CHECK-STACK-DAG: llihf [[REG3:%r[0-9]+]], 185273099
; CHECK-STACK-DAG: stg [[REG3]], 176(%r15)
; CHECK-STACK-DAG: llihf [[REG4:%r[0-9]+]], 202116108
; CHECK-STACK-DAG: oilf [[REG4]], 202116108
; CHECK-STACK-DAG: stg [[REG4]], 184(%r15)
; CHECK-STACK: brasl %r14, bar@PLT

  call void @bar (<1 x i8> <i8 1>,
                  <2 x i8> <i8 2, i8 2>,
                  <4 x i8> <i8 3, i8 3, i8 3, i8 3>,
                  <8 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>,
                  <1 x i8> <i8 5>,
                  <2 x i8> <i8 6, i8 6>,
                  <4 x i8> <i8 7, i8 7, i8 7, i8 7>,
                  <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>,
                  <1 x i8> <i8 9>,
                  <2 x i8> <i8 10, i8 10>,
                  <4 x i8> <i8 11, i8 11, i8 11, i8 11>,
                  <8 x i8> <i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12>)
  ret void
}
 | 
			
		||||
							
								
								
									
										32
									
								
								test/CodeGen/SystemZ/vec-args-05.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								test/CodeGen/SystemZ/vec-args-05.ll
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,32 @@
 | 
			
		||||
; Test the handling of unnamed short vector arguments.
 | 
			
		||||
;
 | 
			
		||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
 | 
			
		||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
 | 
			
		||||
 | 
			
		||||
; This routine is called with two named vector arguments (passed
 | 
			
		||||
; in %v24 and %v26) and two unnamed vector arguments (passed
 | 
			
		||||
; in the single-wide stack slots at 160 and 168).
 | 
			
		||||
declare void @bar(<4 x i8>, <4 x i8>, ...)
 | 
			
		||||
 | 
			
		||||
define void @foo() {
 | 
			
		||||
; CHECK-VEC-LABEL: foo:
 | 
			
		||||
; CHECK-VEC-DAG: vrepib %v24, 1
 | 
			
		||||
; CHECK-VEC-DAG: vrepib %v26, 2
 | 
			
		||||
; CHECK-VEC: brasl %r14, bar@PLT
 | 
			
		||||
;
 | 
			
		||||
; CHECK-STACK-LABEL: foo:
 | 
			
		||||
; CHECK-STACK: aghi %r15, -176
 | 
			
		||||
; CHECK-STACK-DAG: llihf [[REG1:%r[0-9]+]], 50529027
 | 
			
		||||
; CHECK-STACK-DAG: stg [[REG1]], 160(%r15)
 | 
			
		||||
; CHECK-STACK-DAG: llihf [[REG2:%r[0-9]+]], 67372036
 | 
			
		||||
; CHECK-STACK-DAG: stg [[REG2]], 168(%r15)
 | 
			
		||||
; CHECK-STACK: brasl %r14, bar@PLT
 | 
			
		||||
 | 
			
		||||
  call void (<4 x i8>, <4 x i8>, ...) @bar
 | 
			
		||||
              (<4 x i8> <i8 1, i8 1, i8 1, i8 1>,
 | 
			
		||||
               <4 x i8> <i8 2, i8 2, i8 2, i8 2>,
 | 
			
		||||
               <4 x i8> <i8 3, i8 3, i8 3, i8 3>,
 | 
			
		||||
               <4 x i8> <i8 4, i8 4, i8 4, i8 4>)
 | 
			
		||||
  ret void
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -105,3 +105,51 @@ define i16 @f5(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) {
 | 
			
		||||
  %res = add i16 %elem1, %elem2
 | 
			
		||||
  ret i16 %res
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a case where an unpack high can be eliminated from the usual
 | 
			
		||||
; load-extend sequence.
 | 
			
		||||
define void @f6(<8 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK: vlrepg [[REG:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK-NOT: vup
 | 
			
		||||
; CHECK-DAG: vsteb [[REG]], 0(%r3), 1
 | 
			
		||||
; CHECK-DAG: vsteb [[REG]], 0(%r4), 2
 | 
			
		||||
; CHECK-DAG: vsteb [[REG]], 0(%r5), 7
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %vec = load <8 x i8>, <8 x i8> *%ptr1
 | 
			
		||||
  %ext = sext <8 x i8> %vec to <8 x i16>
 | 
			
		||||
  %elem1 = extractelement <8 x i16> %ext, i32 1
 | 
			
		||||
  %elem2 = extractelement <8 x i16> %ext, i32 2
 | 
			
		||||
  %elem3 = extractelement <8 x i16> %ext, i32 7
 | 
			
		||||
  %trunc1 = trunc i16 %elem1 to i8
 | 
			
		||||
  %trunc2 = trunc i16 %elem2 to i8
 | 
			
		||||
  %trunc3 = trunc i16 %elem3 to i8
 | 
			
		||||
  store i8 %trunc1, i8 *%ptr2
 | 
			
		||||
  store i8 %trunc2, i8 *%ptr3
 | 
			
		||||
  store i8 %trunc3, i8 *%ptr4
 | 
			
		||||
  ret void
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; ...and again with a bitcast in between.
 | 
			
		||||
define void @f7(<4 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
 | 
			
		||||
; CHECK-LABEL: f7:
 | 
			
		||||
; CHECK: vlrepf [[REG:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK-NOT: vup
 | 
			
		||||
; CHECK-DAG: vsteb [[REG]], 0(%r3), 0
 | 
			
		||||
; CHECK-DAG: vsteb [[REG]], 0(%r4), 1
 | 
			
		||||
; CHECK-DAG: vsteb [[REG]], 0(%r5), 3
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %vec = load <4 x i8>, <4 x i8> *%ptr1
 | 
			
		||||
  %ext = sext <4 x i8> %vec to <4 x i32>
 | 
			
		||||
  %bitcast = bitcast <4 x i32> %ext to <8 x i16>
 | 
			
		||||
  %elem1 = extractelement <8 x i16> %bitcast, i32 1
 | 
			
		||||
  %elem2 = extractelement <8 x i16> %bitcast, i32 3
 | 
			
		||||
  %elem3 = extractelement <8 x i16> %bitcast, i32 7
 | 
			
		||||
  %trunc1 = trunc i16 %elem1 to i8
 | 
			
		||||
  %trunc2 = trunc i16 %elem2 to i8
 | 
			
		||||
  %trunc3 = trunc i16 %elem3 to i8
 | 
			
		||||
  store i8 %trunc1, i8 *%ptr2
 | 
			
		||||
  store i8 %trunc2, i8 *%ptr3
 | 
			
		||||
  store i8 %trunc3, i8 *%ptr4
 | 
			
		||||
  ret void
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										433
									
								
								test/CodeGen/SystemZ/vec-combine-02.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										433
									
								
								test/CodeGen/SystemZ/vec-combine-02.ll
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,433 @@
 | 
			
		||||
; Test various representations of pack-like operations.
 | 
			
		||||
;
 | 
			
		||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 | 
			
		||||
 | 
			
		||||
; One way of writing a <4 x i32> -> <8 x i16> pack.
 | 
			
		||||
define <8 x i16> @f1(<4 x i32> %val0, <4 x i32> %val1) {
 | 
			
		||||
; CHECK-LABEL: f1:
 | 
			
		||||
; CHECK: vpkf %v24, %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %elem0 = extractelement <4 x i32> %val0, i32 0
 | 
			
		||||
  %elem1 = extractelement <4 x i32> %val0, i32 1
 | 
			
		||||
  %elem2 = extractelement <4 x i32> %val0, i32 2
 | 
			
		||||
  %elem3 = extractelement <4 x i32> %val0, i32 3
 | 
			
		||||
  %elem4 = extractelement <4 x i32> %val1, i32 0
 | 
			
		||||
  %elem5 = extractelement <4 x i32> %val1, i32 1
 | 
			
		||||
  %elem6 = extractelement <4 x i32> %val1, i32 2
 | 
			
		||||
  %elem7 = extractelement <4 x i32> %val1, i32 3
 | 
			
		||||
  %hboth0 = bitcast i32 %elem0 to <2 x i16>
 | 
			
		||||
  %hboth1 = bitcast i32 %elem1 to <2 x i16>
 | 
			
		||||
  %hboth2 = bitcast i32 %elem2 to <2 x i16>
 | 
			
		||||
  %hboth3 = bitcast i32 %elem3 to <2 x i16>
 | 
			
		||||
  %hboth4 = bitcast i32 %elem4 to <2 x i16>
 | 
			
		||||
  %hboth5 = bitcast i32 %elem5 to <2 x i16>
 | 
			
		||||
  %hboth6 = bitcast i32 %elem6 to <2 x i16>
 | 
			
		||||
  %hboth7 = bitcast i32 %elem7 to <2 x i16>
 | 
			
		||||
  %hlow0 = shufflevector <2 x i16> %hboth0, <2 x i16> %hboth1,
 | 
			
		||||
                         <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %hlow1 = shufflevector <2 x i16> %hboth2, <2 x i16> %hboth3,
 | 
			
		||||
                         <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %hlow2 = shufflevector <2 x i16> %hboth4, <2 x i16> %hboth5,
 | 
			
		||||
                         <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %hlow3 = shufflevector <2 x i16> %hboth6, <2 x i16> %hboth7,
 | 
			
		||||
                         <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %join0 = shufflevector <2 x i16> %hlow0, <2 x i16> %hlow1,
 | 
			
		||||
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 | 
			
		||||
  %join1 = shufflevector <2 x i16> %hlow2, <2 x i16> %hlow3,
 | 
			
		||||
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 | 
			
		||||
  %ret = shufflevector <4 x i16> %join0, <4 x i16> %join1,
 | 
			
		||||
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
 | 
			
		||||
                                  i32 4, i32 5, i32 6, i32 7>
 | 
			
		||||
  ret <8 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; A different way of writing a <4 x i32> -> <8 x i16> pack.
 | 
			
		||||
define <8 x i16> @f2(<4 x i32> %val0, <4 x i32> %val1) {
 | 
			
		||||
; CHECK-LABEL: f2:
 | 
			
		||||
; CHECK: vpkf %v24, %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %elem0 = extractelement <4 x i32> %val0, i32 0
 | 
			
		||||
  %elem1 = extractelement <4 x i32> %val0, i32 1
 | 
			
		||||
  %elem2 = extractelement <4 x i32> %val0, i32 2
 | 
			
		||||
  %elem3 = extractelement <4 x i32> %val0, i32 3
 | 
			
		||||
  %elem4 = extractelement <4 x i32> %val1, i32 0
 | 
			
		||||
  %elem5 = extractelement <4 x i32> %val1, i32 1
 | 
			
		||||
  %elem6 = extractelement <4 x i32> %val1, i32 2
 | 
			
		||||
  %elem7 = extractelement <4 x i32> %val1, i32 3
 | 
			
		||||
  %wvec0 = insertelement <4 x i32> undef, i32 %elem0, i32 0
 | 
			
		||||
  %wvec1 = insertelement <4 x i32> undef, i32 %elem1, i32 0
 | 
			
		||||
  %wvec2 = insertelement <4 x i32> undef, i32 %elem2, i32 0
 | 
			
		||||
  %wvec3 = insertelement <4 x i32> undef, i32 %elem3, i32 0
 | 
			
		||||
  %wvec4 = insertelement <4 x i32> undef, i32 %elem4, i32 0
 | 
			
		||||
  %wvec5 = insertelement <4 x i32> undef, i32 %elem5, i32 0
 | 
			
		||||
  %wvec6 = insertelement <4 x i32> undef, i32 %elem6, i32 0
 | 
			
		||||
  %wvec7 = insertelement <4 x i32> undef, i32 %elem7, i32 0
 | 
			
		||||
  %hvec0 = bitcast <4 x i32> %wvec0 to <8 x i16>
 | 
			
		||||
  %hvec1 = bitcast <4 x i32> %wvec1 to <8 x i16>
 | 
			
		||||
  %hvec2 = bitcast <4 x i32> %wvec2 to <8 x i16>
 | 
			
		||||
  %hvec3 = bitcast <4 x i32> %wvec3 to <8 x i16>
 | 
			
		||||
  %hvec4 = bitcast <4 x i32> %wvec4 to <8 x i16>
 | 
			
		||||
  %hvec5 = bitcast <4 x i32> %wvec5 to <8 x i16>
 | 
			
		||||
  %hvec6 = bitcast <4 x i32> %wvec6 to <8 x i16>
 | 
			
		||||
  %hvec7 = bitcast <4 x i32> %wvec7 to <8 x i16>
 | 
			
		||||
  %hlow0 = shufflevector <8 x i16> %hvec0, <8 x i16> %hvec1,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
 | 
			
		||||
                                    i32 undef, i32 undef, i32 undef, i32 undef>
 | 
			
		||||
  %hlow1 = shufflevector <8 x i16> %hvec2, <8 x i16> %hvec3,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
 | 
			
		||||
                                    i32 undef, i32 undef, i32 undef, i32 undef>
 | 
			
		||||
  %hlow2 = shufflevector <8 x i16> %hvec4, <8 x i16> %hvec5,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
 | 
			
		||||
                                    i32 undef, i32 undef, i32 undef, i32 undef>
 | 
			
		||||
  %hlow3 = shufflevector <8 x i16> %hvec6, <8 x i16> %hvec7,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
 | 
			
		||||
                                    i32 undef, i32 undef, i32 undef, i32 undef>
 | 
			
		||||
  %join0 = shufflevector <8 x i16> %hlow0, <8 x i16> %hlow1,
 | 
			
		||||
                         <8 x i32> <i32 0, i32 1, i32 8, i32 9,
 | 
			
		||||
                                    i32 undef, i32 undef, i32 undef, i32 undef>
 | 
			
		||||
  %join1 = shufflevector <8 x i16> %hlow2, <8 x i16> %hlow3,
 | 
			
		||||
                         <8 x i32> <i32 0, i32 1, i32 8, i32 9,
 | 
			
		||||
                                    i32 undef, i32 undef, i32 undef, i32 undef>
 | 
			
		||||
  %ret = shufflevector <8 x i16> %join0, <8 x i16> %join1,
 | 
			
		||||
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
 | 
			
		||||
                                  i32 8, i32 9, i32 10, i32 11>
 | 
			
		||||
  ret <8 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; A direct pack operation.
 | 
			
		||||
define <8 x i16> @f3(<4 x i32> %val0, <4 x i32> %val1) {
 | 
			
		||||
; CHECK-LABEL: f3:
 | 
			
		||||
; CHECK: vpkf %v24, %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
 | 
			
		||||
  %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
 | 
			
		||||
  %ret = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
 | 
			
		||||
                       <8 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                  i32 9, i32 11, i32 13, i32 15>
 | 
			
		||||
  ret <8 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; One way of writing a <4 x i32> -> <16 x i8> pack.  It doesn't matter
 | 
			
		||||
; whether the first pack is VPKF or VPKH since the even bytes of the
 | 
			
		||||
; result are discarded.
 | 
			
		||||
define <16 x i8> @f4(<4 x i32> %val0, <4 x i32> %val1,
 | 
			
		||||
                     <4 x i32> %val2, <4 x i32> %val3) {
 | 
			
		||||
; CHECK-LABEL: f4:
 | 
			
		||||
; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26
 | 
			
		||||
; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30
 | 
			
		||||
; CHECK: vpkh %v24, [[REG1]], [[REG2]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
 | 
			
		||||
  %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
 | 
			
		||||
  %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16>
 | 
			
		||||
  %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16>
 | 
			
		||||
  %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                    i32 9, i32 11, i32 13, i32 15>
 | 
			
		||||
  %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                    i32 9, i32 11, i32 13, i32 15>
 | 
			
		||||
  %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8>
 | 
			
		||||
  %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8>
 | 
			
		||||
  %ret = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5,
 | 
			
		||||
                       <16 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                   i32 9, i32 11, i32 13, i32 15,
 | 
			
		||||
                                   i32 17, i32 19, i32 21, i32 23,
 | 
			
		||||
                                   i32 25, i32 27, i32 29, i32 31>
 | 
			
		||||
  ret <16 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Check the same operation, but with elements being extracted from the result.
 | 
			
		||||
define void @f5(<4 x i32> %val0, <4 x i32> %val1,
 | 
			
		||||
                <4 x i32> %val2, <4 x i32> %val3,
 | 
			
		||||
                i8 *%base) {
 | 
			
		||||
; CHECK-LABEL: f5:
 | 
			
		||||
; CHECK-DAG: vsteb %v24, 0(%r2), 11
 | 
			
		||||
; CHECK-DAG: vsteb %v26, 1(%r2), 15
 | 
			
		||||
; CHECK-DAG: vsteb %v28, 2(%r2), 3
 | 
			
		||||
; CHECK-DAG: vsteb %v30, 3(%r2), 7
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
 | 
			
		||||
  %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
 | 
			
		||||
  %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16>
 | 
			
		||||
  %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16>
 | 
			
		||||
  %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                    i32 9, i32 11, i32 13, i32 15>
 | 
			
		||||
  %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                    i32 9, i32 11, i32 13, i32 15>
 | 
			
		||||
  %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8>
 | 
			
		||||
  %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8>
 | 
			
		||||
  %vec = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5,
 | 
			
		||||
                       <16 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                   i32 9, i32 11, i32 13, i32 15,
 | 
			
		||||
                                   i32 17, i32 19, i32 21, i32 23,
 | 
			
		||||
                                   i32 25, i32 27, i32 29, i32 31>
 | 
			
		||||
 | 
			
		||||
  %ptr0 = getelementptr i8, i8 *%base, i64 0
 | 
			
		||||
  %ptr1 = getelementptr i8, i8 *%base, i64 1
 | 
			
		||||
  %ptr2 = getelementptr i8, i8 *%base, i64 2
 | 
			
		||||
  %ptr3 = getelementptr i8, i8 *%base, i64 3
 | 
			
		||||
 | 
			
		||||
  %byte0 = extractelement <16 x i8> %vec, i32 2
 | 
			
		||||
  %byte1 = extractelement <16 x i8> %vec, i32 7
 | 
			
		||||
  %byte2 = extractelement <16 x i8> %vec, i32 8
 | 
			
		||||
  %byte3 = extractelement <16 x i8> %vec, i32 13
 | 
			
		||||
 | 
			
		||||
  store i8 %byte0, i8 *%ptr0
 | 
			
		||||
  store i8 %byte1, i8 *%ptr1
 | 
			
		||||
  store i8 %byte2, i8 *%ptr2
 | 
			
		||||
  store i8 %byte3, i8 *%ptr3
 | 
			
		||||
 | 
			
		||||
  ret void
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; A different way of writing a <4 x i32> -> <16 x i8> pack.
 | 
			
		||||
define <16 x i8> @f6(<4 x i32> %val0, <4 x i32> %val1,
 | 
			
		||||
                     <4 x i32> %val2, <4 x i32> %val3) {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26
 | 
			
		||||
; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30
 | 
			
		||||
; CHECK: vpkh %v24, [[REG1]], [[REG2]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %elem0 = extractelement <4 x i32> %val0, i32 0
 | 
			
		||||
  %elem1 = extractelement <4 x i32> %val0, i32 1
 | 
			
		||||
  %elem2 = extractelement <4 x i32> %val0, i32 2
 | 
			
		||||
  %elem3 = extractelement <4 x i32> %val0, i32 3
 | 
			
		||||
  %elem4 = extractelement <4 x i32> %val1, i32 0
 | 
			
		||||
  %elem5 = extractelement <4 x i32> %val1, i32 1
 | 
			
		||||
  %elem6 = extractelement <4 x i32> %val1, i32 2
 | 
			
		||||
  %elem7 = extractelement <4 x i32> %val1, i32 3
 | 
			
		||||
  %elem8 = extractelement <4 x i32> %val2, i32 0
 | 
			
		||||
  %elem9 = extractelement <4 x i32> %val2, i32 1
 | 
			
		||||
  %elem10 = extractelement <4 x i32> %val2, i32 2
 | 
			
		||||
  %elem11 = extractelement <4 x i32> %val2, i32 3
 | 
			
		||||
  %elem12 = extractelement <4 x i32> %val3, i32 0
 | 
			
		||||
  %elem13 = extractelement <4 x i32> %val3, i32 1
 | 
			
		||||
  %elem14 = extractelement <4 x i32> %val3, i32 2
 | 
			
		||||
  %elem15 = extractelement <4 x i32> %val3, i32 3
 | 
			
		||||
  %bitcast0 = bitcast i32 %elem0 to <2 x i16>
 | 
			
		||||
  %bitcast1 = bitcast i32 %elem1 to <2 x i16>
 | 
			
		||||
  %bitcast2 = bitcast i32 %elem2 to <2 x i16>
 | 
			
		||||
  %bitcast3 = bitcast i32 %elem3 to <2 x i16>
 | 
			
		||||
  %bitcast4 = bitcast i32 %elem4 to <2 x i16>
 | 
			
		||||
  %bitcast5 = bitcast i32 %elem5 to <2 x i16>
 | 
			
		||||
  %bitcast6 = bitcast i32 %elem6 to <2 x i16>
 | 
			
		||||
  %bitcast7 = bitcast i32 %elem7 to <2 x i16>
 | 
			
		||||
  %bitcast8 = bitcast i32 %elem8 to <2 x i16>
 | 
			
		||||
  %bitcast9 = bitcast i32 %elem9 to <2 x i16>
 | 
			
		||||
  %bitcast10 = bitcast i32 %elem10 to <2 x i16>
 | 
			
		||||
  %bitcast11 = bitcast i32 %elem11 to <2 x i16>
 | 
			
		||||
  %bitcast12 = bitcast i32 %elem12 to <2 x i16>
 | 
			
		||||
  %bitcast13 = bitcast i32 %elem13 to <2 x i16>
 | 
			
		||||
  %bitcast14 = bitcast i32 %elem14 to <2 x i16>
 | 
			
		||||
  %bitcast15 = bitcast i32 %elem15 to <2 x i16>
 | 
			
		||||
  %low0 = shufflevector <2 x i16> %bitcast0, <2 x i16> %bitcast1,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low1 = shufflevector <2 x i16> %bitcast2, <2 x i16> %bitcast3,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low2 = shufflevector <2 x i16> %bitcast4, <2 x i16> %bitcast5,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low3 = shufflevector <2 x i16> %bitcast6, <2 x i16> %bitcast7,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low4 = shufflevector <2 x i16> %bitcast8, <2 x i16> %bitcast9,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low5 = shufflevector <2 x i16> %bitcast10, <2 x i16> %bitcast11,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low6 = shufflevector <2 x i16> %bitcast12, <2 x i16> %bitcast13,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low7 = shufflevector <2 x i16> %bitcast14, <2 x i16> %bitcast15,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %bytes0 = bitcast <2 x i16> %low0 to <4 x i8>
 | 
			
		||||
  %bytes1 = bitcast <2 x i16> %low1 to <4 x i8>
 | 
			
		||||
  %bytes2 = bitcast <2 x i16> %low2 to <4 x i8>
 | 
			
		||||
  %bytes3 = bitcast <2 x i16> %low3 to <4 x i8>
 | 
			
		||||
  %bytes4 = bitcast <2 x i16> %low4 to <4 x i8>
 | 
			
		||||
  %bytes5 = bitcast <2 x i16> %low5 to <4 x i8>
 | 
			
		||||
  %bytes6 = bitcast <2 x i16> %low6 to <4 x i8>
 | 
			
		||||
  %bytes7 = bitcast <2 x i16> %low7 to <4 x i8>
 | 
			
		||||
  %blow0 = shufflevector <4 x i8> %bytes0, <4 x i8> %bytes1,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %blow1 = shufflevector <4 x i8> %bytes2, <4 x i8> %bytes3,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %blow2 = shufflevector <4 x i8> %bytes4, <4 x i8> %bytes5,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %blow3 = shufflevector <4 x i8> %bytes6, <4 x i8> %bytes7,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %join0 = shufflevector <4 x i8> %blow0, <4 x i8> %blow1,
 | 
			
		||||
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
 | 
			
		||||
                                    i32 4, i32 5, i32 6, i32 7>
 | 
			
		||||
  %join1 = shufflevector <4 x i8> %blow2, <4 x i8> %blow3,
 | 
			
		||||
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
 | 
			
		||||
                                    i32 4, i32 5, i32 6, i32 7>
 | 
			
		||||
  %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1,
 | 
			
		||||
                       <16 x i32> <i32 0, i32 1, i32 2, i32 3,
 | 
			
		||||
                                   i32 4, i32 5, i32 6, i32 7,
 | 
			
		||||
                                   i32 8, i32 9, i32 10, i32 11,
 | 
			
		||||
                                   i32 12, i32 13, i32 14, i32 15>
 | 
			
		||||
  ret <16 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; One way of writing a <2 x i64> -> <16 x i8> pack.
 | 
			
		||||
define <16 x i8> @f7(<2 x i64> %val0, <2 x i64> %val1,
 | 
			
		||||
                     <2 x i64> %val2, <2 x i64> %val3,
 | 
			
		||||
                     <2 x i64> %val4, <2 x i64> %val5,
 | 
			
		||||
                     <2 x i64> %val6, <2 x i64> %val7) {
 | 
			
		||||
; CHECK-LABEL: f7:
 | 
			
		||||
; CHECK-DAG: vpk{{[hfg]}} [[REG1:%v[0-9]+]], %v24, %v26
 | 
			
		||||
; CHECK-DAG: vpk{{[hfg]}} [[REG2:%v[0-9]+]], %v28, %v30
 | 
			
		||||
; CHECK-DAG: vpk{{[hfg]}} [[REG3:%v[0-9]+]], %v25, %v27
 | 
			
		||||
; CHECK-DAG: vpk{{[hfg]}} [[REG4:%v[0-9]+]], %v29, %v31
 | 
			
		||||
; CHECK-DAG: vpk{{[hf]}} [[REG5:%v[0-9]+]], [[REG1]], [[REG2]]
 | 
			
		||||
; CHECK-DAG: vpk{{[hf]}} [[REG6:%v[0-9]+]], [[REG3]], [[REG4]]
 | 
			
		||||
; CHECK: vpkh %v24, [[REG5]], [[REG6]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %elem0 = extractelement <2 x i64> %val0, i32 0
 | 
			
		||||
  %elem1 = extractelement <2 x i64> %val0, i32 1
 | 
			
		||||
  %elem2 = extractelement <2 x i64> %val1, i32 0
 | 
			
		||||
  %elem3 = extractelement <2 x i64> %val1, i32 1
 | 
			
		||||
  %elem4 = extractelement <2 x i64> %val2, i32 0
 | 
			
		||||
  %elem5 = extractelement <2 x i64> %val2, i32 1
 | 
			
		||||
  %elem6 = extractelement <2 x i64> %val3, i32 0
 | 
			
		||||
  %elem7 = extractelement <2 x i64> %val3, i32 1
 | 
			
		||||
  %elem8 = extractelement <2 x i64> %val4, i32 0
 | 
			
		||||
  %elem9 = extractelement <2 x i64> %val4, i32 1
 | 
			
		||||
  %elem10 = extractelement <2 x i64> %val5, i32 0
 | 
			
		||||
  %elem11 = extractelement <2 x i64> %val5, i32 1
 | 
			
		||||
  %elem12 = extractelement <2 x i64> %val6, i32 0
 | 
			
		||||
  %elem13 = extractelement <2 x i64> %val6, i32 1
 | 
			
		||||
  %elem14 = extractelement <2 x i64> %val7, i32 0
 | 
			
		||||
  %elem15 = extractelement <2 x i64> %val7, i32 1
 | 
			
		||||
  %bitcast0 = bitcast i64 %elem0 to <2 x i32>
 | 
			
		||||
  %bitcast1 = bitcast i64 %elem1 to <2 x i32>
 | 
			
		||||
  %bitcast2 = bitcast i64 %elem2 to <2 x i32>
 | 
			
		||||
  %bitcast3 = bitcast i64 %elem3 to <2 x i32>
 | 
			
		||||
  %bitcast4 = bitcast i64 %elem4 to <2 x i32>
 | 
			
		||||
  %bitcast5 = bitcast i64 %elem5 to <2 x i32>
 | 
			
		||||
  %bitcast6 = bitcast i64 %elem6 to <2 x i32>
 | 
			
		||||
  %bitcast7 = bitcast i64 %elem7 to <2 x i32>
 | 
			
		||||
  %bitcast8 = bitcast i64 %elem8 to <2 x i32>
 | 
			
		||||
  %bitcast9 = bitcast i64 %elem9 to <2 x i32>
 | 
			
		||||
  %bitcast10 = bitcast i64 %elem10 to <2 x i32>
 | 
			
		||||
  %bitcast11 = bitcast i64 %elem11 to <2 x i32>
 | 
			
		||||
  %bitcast12 = bitcast i64 %elem12 to <2 x i32>
 | 
			
		||||
  %bitcast13 = bitcast i64 %elem13 to <2 x i32>
 | 
			
		||||
  %bitcast14 = bitcast i64 %elem14 to <2 x i32>
 | 
			
		||||
  %bitcast15 = bitcast i64 %elem15 to <2 x i32>
 | 
			
		||||
  %low0 = shufflevector <2 x i32> %bitcast0, <2 x i32> %bitcast1,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low1 = shufflevector <2 x i32> %bitcast2, <2 x i32> %bitcast3,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low2 = shufflevector <2 x i32> %bitcast4, <2 x i32> %bitcast5,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low3 = shufflevector <2 x i32> %bitcast6, <2 x i32> %bitcast7,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low4 = shufflevector <2 x i32> %bitcast8, <2 x i32> %bitcast9,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low5 = shufflevector <2 x i32> %bitcast10, <2 x i32> %bitcast11,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low6 = shufflevector <2 x i32> %bitcast12, <2 x i32> %bitcast13,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %low7 = shufflevector <2 x i32> %bitcast14, <2 x i32> %bitcast15,
 | 
			
		||||
                        <2 x i32> <i32 1, i32 3>
 | 
			
		||||
  %half0 = bitcast <2 x i32> %low0 to <4 x i16>
 | 
			
		||||
  %half1 = bitcast <2 x i32> %low1 to <4 x i16>
 | 
			
		||||
  %half2 = bitcast <2 x i32> %low2 to <4 x i16>
 | 
			
		||||
  %half3 = bitcast <2 x i32> %low3 to <4 x i16>
 | 
			
		||||
  %half4 = bitcast <2 x i32> %low4 to <4 x i16>
 | 
			
		||||
  %half5 = bitcast <2 x i32> %low5 to <4 x i16>
 | 
			
		||||
  %half6 = bitcast <2 x i32> %low6 to <4 x i16>
 | 
			
		||||
  %half7 = bitcast <2 x i32> %low7 to <4 x i16>
 | 
			
		||||
  %hlow0 = shufflevector <4 x i16> %half0, <4 x i16> %half1,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %hlow1 = shufflevector <4 x i16> %half2, <4 x i16> %half3,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %hlow2 = shufflevector <4 x i16> %half4, <4 x i16> %half5,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %hlow3 = shufflevector <4 x i16> %half6, <4 x i16> %half7,
 | 
			
		||||
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %bytes0 = bitcast <4 x i16> %hlow0 to <8 x i8>
 | 
			
		||||
  %bytes1 = bitcast <4 x i16> %hlow1 to <8 x i8>
 | 
			
		||||
  %bytes2 = bitcast <4 x i16> %hlow2 to <8 x i8>
 | 
			
		||||
  %bytes3 = bitcast <4 x i16> %hlow3 to <8 x i8>
 | 
			
		||||
  %join0 = shufflevector <8 x i8> %bytes0, <8 x i8> %bytes1,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                    i32 9, i32 11, i32 13, i32 15>
 | 
			
		||||
  %join1 = shufflevector <8 x i8> %bytes2, <8 x i8> %bytes3,
 | 
			
		||||
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
 | 
			
		||||
                                    i32 9, i32 11, i32 13, i32 15>
 | 
			
		||||
  %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1,
 | 
			
		||||
                       <16 x i32> <i32 0, i32 1, i32 2, i32 3,
 | 
			
		||||
                                   i32 4, i32 5, i32 6, i32 7,
 | 
			
		||||
                                   i32 8, i32 9, i32 10, i32 11,
 | 
			
		||||
                                   i32 12, i32 13, i32 14, i32 15>
 | 
			
		||||
  ret <16 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a <2 x i64> -> <4 x f32> pack in which only individual elements are
 | 
			
		||||
; needed.
 | 
			
		||||
define float @f8(i64 %scalar0, i64 %scalar1, i64 %scalar2, i64 %scalar3) {
 | 
			
		||||
; CHECK-LABEL: f8:
 | 
			
		||||
; CHECK-NOT: vperm
 | 
			
		||||
; CHECK-NOT: vpk
 | 
			
		||||
; CHECK-NOT: vmrh
 | 
			
		||||
; CHECK: aebr {{%f[0-7]}},
 | 
			
		||||
; CHECK: aebr {{%f[0-7]}},
 | 
			
		||||
; CHECK: meebr %f0,
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %vec0 = insertelement <2 x i64> undef, i64 %scalar0, i32 0
 | 
			
		||||
  %vec1 = insertelement <2 x i64> undef, i64 %scalar1, i32 0
 | 
			
		||||
  %vec2 = insertelement <2 x i64> undef, i64 %scalar2, i32 0
 | 
			
		||||
  %vec3 = insertelement <2 x i64> undef, i64 %scalar3, i32 0
 | 
			
		||||
  %join0 = shufflevector <2 x i64> %vec0, <2 x i64> %vec1,
 | 
			
		||||
                         <2 x i32> <i32 0, i32 2>
 | 
			
		||||
  %join1 = shufflevector <2 x i64> %vec2, <2 x i64> %vec3,
 | 
			
		||||
                         <2 x i32> <i32 0, i32 2>
 | 
			
		||||
  %bitcast0 = bitcast <2 x i64> %join0 to <4 x float>
 | 
			
		||||
  %bitcast1 = bitcast <2 x i64> %join1 to <4 x float>
 | 
			
		||||
  %pack = shufflevector <4 x float> %bitcast0, <4 x float> %bitcast1,
 | 
			
		||||
                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %elt0 = extractelement <4 x float> %pack, i32 0
 | 
			
		||||
  %elt1 = extractelement <4 x float> %pack, i32 1
 | 
			
		||||
  %elt2 = extractelement <4 x float> %pack, i32 2
 | 
			
		||||
  %elt3 = extractelement <4 x float> %pack, i32 3
 | 
			
		||||
  %add0 = fadd float %elt0, %elt2
 | 
			
		||||
  %add1 = fadd float %elt1, %elt3
 | 
			
		||||
  %ret = fmul float %add0, %add1
 | 
			
		||||
  ret float %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a <2 x f64> -> <4 x i32> pack in which only individual elements are
 | 
			
		||||
; needed.
 | 
			
		||||
define i32 @f9(double %scalar0, double %scalar1, double %scalar2,
 | 
			
		||||
               double %scalar3) {
 | 
			
		||||
; CHECK-LABEL: f9:
 | 
			
		||||
; CHECK-NOT: vperm
 | 
			
		||||
; CHECK-NOT: vpk
 | 
			
		||||
; CHECK-NOT: vmrh
 | 
			
		||||
; CHECK: ar {{%r[0-5]}},
 | 
			
		||||
; CHECK: ar {{%r[0-5]}},
 | 
			
		||||
; CHECK: or %r2,
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %vec0 = insertelement <2 x double> undef, double %scalar0, i32 0
 | 
			
		||||
  %vec1 = insertelement <2 x double> undef, double %scalar1, i32 0
 | 
			
		||||
  %vec2 = insertelement <2 x double> undef, double %scalar2, i32 0
 | 
			
		||||
  %vec3 = insertelement <2 x double> undef, double %scalar3, i32 0
 | 
			
		||||
  %join0 = shufflevector <2 x double> %vec0, <2 x double> %vec1,
 | 
			
		||||
                         <2 x i32> <i32 0, i32 2>
 | 
			
		||||
  %join1 = shufflevector <2 x double> %vec2, <2 x double> %vec3,
 | 
			
		||||
                         <2 x i32> <i32 0, i32 2>
 | 
			
		||||
  %bitcast0 = bitcast <2 x double> %join0 to <4 x i32>
 | 
			
		||||
  %bitcast1 = bitcast <2 x double> %join1 to <4 x i32>
 | 
			
		||||
  %pack = shufflevector <4 x i32> %bitcast0, <4 x i32> %bitcast1,
 | 
			
		||||
                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 | 
			
		||||
  %elt0 = extractelement <4 x i32> %pack, i32 0
 | 
			
		||||
  %elt1 = extractelement <4 x i32> %pack, i32 1
 | 
			
		||||
  %elt2 = extractelement <4 x i32> %pack, i32 2
 | 
			
		||||
  %elt3 = extractelement <4 x i32> %pack, i32 3
 | 
			
		||||
  %add0 = add i32 %elt0, %elt2
 | 
			
		||||
  %add1 = add i32 %elt1, %elt3
 | 
			
		||||
  %ret = or i32 %add0, %add1
 | 
			
		||||
  ret i32 %ret
 | 
			
		||||
}
 | 
			
		||||
@@ -53,3 +53,51 @@ define <16 x i8> @f5() {
 | 
			
		||||
                 i8 0, i8 -1, i8 -1, i8 -1,
 | 
			
		||||
                 i8 0, i8 -1, i8 0, i8 -1>
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test an all-zeros v2i8 that gets widened to v16i8.
 | 
			
		||||
define <2 x i8> @f6() {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK: vgbm %v24, 0
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i8> zeroinitializer
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a mixed v2i8 that gets widened to v16i8 (mask 0x8000).
 | 
			
		||||
define <2 x i8> @f7() {
 | 
			
		||||
; CHECK-LABEL: f7:
 | 
			
		||||
; CHECK: vgbm %v24, 32768
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i8> <i8 255, i8 0>
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test an all-zeros v4i8 that gets widened to v16i8.
 | 
			
		||||
define <4 x i8> @f8() {
 | 
			
		||||
; CHECK-LABEL: f8:
 | 
			
		||||
; CHECK: vgbm %v24, 0
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <4 x i8> zeroinitializer
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a mixed v4i8 that gets widened to v16i8 (mask 0x9000).
 | 
			
		||||
define <4 x i8> @f9() {
 | 
			
		||||
; CHECK-LABEL: f9:
 | 
			
		||||
; CHECK: vgbm %v24, 36864
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <4 x i8> <i8 255, i8 0, i8 0, i8 255>
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test an all-zeros v8i8 that gets widened to v16i8.
 | 
			
		||||
define <8 x i8> @f10() {
 | 
			
		||||
; CHECK-LABEL: f10:
 | 
			
		||||
; CHECK: vgbm %v24, 0
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <8 x i8> zeroinitializer
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a mixed v8i8 that gets widened to v16i8 (mask 0xE500).
 | 
			
		||||
define <8 x i8> @f11() {
 | 
			
		||||
; CHECK-LABEL: f11:
 | 
			
		||||
; CHECK: vgbm %v24, 58624
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <8 x i8> <i8 255, i8 255, i8 255, i8 0, i8 0, i8 255, i8 0, i8 255>
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -45,3 +45,35 @@ define <8 x i16> @f5() {
 | 
			
		||||
  ret <8 x i16> <i16 65280, i16 0, i16 65535, i16 0,
 | 
			
		||||
                 i16 255, i16 65535, i16 256, i16 65280>
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test an all-zeros v2i16 that gets widened to v8i16.
 | 
			
		||||
define <2 x i16> @f6() {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK: vgbm %v24, 0
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i16> zeroinitializer
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a mixed v2i16 that gets widened to v8i16 (mask 0xc000).
 | 
			
		||||
define <2 x i16> @f7() {
 | 
			
		||||
; CHECK-LABEL: f7:
 | 
			
		||||
; CHECK: vgbm %v24, 49152
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i16> <i16 65535, i16 0>
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test an all-zeros v4i16 that gets widened to v8i16.
 | 
			
		||||
define <4 x i16> @f8() {
 | 
			
		||||
; CHECK-LABEL: f8:
 | 
			
		||||
; CHECK: vgbm %v24, 0
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <4 x i16> zeroinitializer
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a mixed v4i16 that gets widened to v8i16 (mask 0x7200).
 | 
			
		||||
define <4 x i16> @f9() {
 | 
			
		||||
; CHECK-LABEL: f9:
 | 
			
		||||
; CHECK: vgbm %v24, 29184
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <4 x i16> <i16 255, i16 65535, i16 0, i16 65280>
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -41,3 +41,19 @@ define <4 x i32> @f5() {
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <4 x i32> <i32 4278190080, i32 1, i32 16777215, i32 16776960>
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test an all-zeros v2i32 that gets widened to v4i32.
 | 
			
		||||
define <2 x i32> @f6() {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK: vgbm %v24, 0
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i32> zeroinitializer
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a mixed v2i32 that gets widened to v4i32 (mask 0xae00).
 | 
			
		||||
define <2 x i32> @f7() {
 | 
			
		||||
; CHECK-LABEL: f7:
 | 
			
		||||
; CHECK: vgbm %v24, 44544
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i32> <i32 4278255360, i32 -256>
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -45,3 +45,19 @@ define <4 x float> @f5() {
 | 
			
		||||
  ret <4 x float> <float 0xffffe00000000000, float 0x381fffffc0000000,
 | 
			
		||||
                   float 0x379fffe000000000, float 0x371fe00000000000>
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test an all-zeros v2f32 that gets widened to v4f32.
 | 
			
		||||
define <2 x float> @f6() {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK: vgbm %v24, 0
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x float> zeroinitializer
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a mixed v2f32 that gets widened to v4f32 (mask 0xc700).
 | 
			
		||||
define <2 x float> @f7() {
 | 
			
		||||
; CHECK-LABEL: f7:
 | 
			
		||||
; CHECK: vgbm %v24, 50944
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x float> <float 0xffffe00000000000, float 0x381fffffe0000000>
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -49,3 +49,59 @@ define <2 x double> @f6(<2 x double> %val1, <2 x double> %val2) {
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x double> %val2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test v2i8 moves.
 | 
			
		||||
define <2 x i8> @f7(<2 x i8> %val1, <2 x i8> %val2) {
 | 
			
		||||
; CHECK-LABEL: f7:
 | 
			
		||||
; CHECK: vlr %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i8> %val2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test v4i8 moves.
 | 
			
		||||
define <4 x i8> @f8(<4 x i8> %val1, <4 x i8> %val2) {
 | 
			
		||||
; CHECK-LABEL: f8:
 | 
			
		||||
; CHECK: vlr %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <4 x i8> %val2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test v8i8 moves.
 | 
			
		||||
define <8 x i8> @f9(<8 x i8> %val1, <8 x i8> %val2) {
 | 
			
		||||
; CHECK-LABEL: f9:
 | 
			
		||||
; CHECK: vlr %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <8 x i8> %val2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test v2i16 moves.
 | 
			
		||||
define <2 x i16> @f10(<2 x i16> %val1, <2 x i16> %val2) {
 | 
			
		||||
; CHECK-LABEL: f10:
 | 
			
		||||
; CHECK: vlr %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i16> %val2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test v4i16 moves.
 | 
			
		||||
define <4 x i16> @f11(<4 x i16> %val1, <4 x i16> %val2) {
 | 
			
		||||
; CHECK-LABEL: f11:
 | 
			
		||||
; CHECK: vlr %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <4 x i16> %val2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test v2i32 moves.
 | 
			
		||||
define <2 x i32> @f12(<2 x i32> %val1, <2 x i32> %val2) {
 | 
			
		||||
; CHECK-LABEL: f12:
 | 
			
		||||
; CHECK: vlr %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x i32> %val2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test v2f32 moves.
 | 
			
		||||
define <2 x float> @f13(<2 x float> %val1, <2 x float> %val2) {
 | 
			
		||||
; CHECK-LABEL: f13:
 | 
			
		||||
; CHECK: vlr %v24, %v26
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  ret <2 x float> %val2
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -49,8 +49,8 @@ define <2 x i64> @f4(i64 %val) {
 | 
			
		||||
; Test v4f32 insertion into 0.
 | 
			
		||||
define <4 x float> @f5(float %val) {
 | 
			
		||||
; CHECK-LABEL: f5:
 | 
			
		||||
; CHECK: vgbm [[ZERO:%v[0-9]+]], 0
 | 
			
		||||
; CHECK: vmrhf [[REG:%v[0-9]+]], [[ZERO]], %v0
 | 
			
		||||
; CHECK-DAG: vuplhf [[REG:%v[0-9]+]], %v0
 | 
			
		||||
; CHECK-DAG: vgbm [[ZERO:%v[0-9]+]], 0
 | 
			
		||||
; CHECK: vmrhg %v24, [[ZERO]], [[REG]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 3
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										106
									
								
								test/CodeGen/SystemZ/vec-move-15.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								test/CodeGen/SystemZ/vec-move-15.ll
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,106 @@
 | 
			
		||||
; Test vector sign-extending loads.
 | 
			
		||||
;
 | 
			
		||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 | 
			
		||||
 | 
			
		||||
; Test a v16i1->v16i8 extension.
 | 
			
		||||
define <16 x i8> @f1(<16 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <16 x i1>, <16 x i1> *%ptr
 | 
			
		||||
  %ret = sext <16 x i1> %val to <16 x i8>
 | 
			
		||||
  ret <16 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v8i1->v8i16 extension.
 | 
			
		||||
define <8 x i16> @f2(<8 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <8 x i1>, <8 x i1> *%ptr
 | 
			
		||||
  %ret = sext <8 x i1> %val to <8 x i16>
 | 
			
		||||
  ret <8 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v8i8->v8i16 extension.
 | 
			
		||||
define <8 x i16> @f3(<8 x i8> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f3:
 | 
			
		||||
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuphb %v24, [[REG1]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <8 x i8>, <8 x i8> *%ptr
 | 
			
		||||
  %ret = sext <8 x i8> %val to <8 x i16>
 | 
			
		||||
  ret <8 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i1->v4i32 extension.
 | 
			
		||||
define <4 x i32> @f4(<4 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <4 x i1>, <4 x i1> *%ptr
 | 
			
		||||
  %ret = sext <4 x i1> %val to <4 x i32>
 | 
			
		||||
  ret <4 x i32> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i8->v4i32 extension.
 | 
			
		||||
define <4 x i32> @f5(<4 x i8> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f5:
 | 
			
		||||
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]]
 | 
			
		||||
; CHECK: vuphh %v24, [[REG2]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <4 x i8>, <4 x i8> *%ptr
 | 
			
		||||
  %ret = sext <4 x i8> %val to <4 x i32>
 | 
			
		||||
  ret <4 x i32> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i16->v4i32 extension.
 | 
			
		||||
define <4 x i32> @f6(<4 x i16> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuphh %v24, [[REG1]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <4 x i16>, <4 x i16> *%ptr
 | 
			
		||||
  %ret = sext <4 x i16> %val to <4 x i32>
 | 
			
		||||
  ret <4 x i32> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i1->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f7(<2 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <2 x i1>, <2 x i1> *%ptr
 | 
			
		||||
  %ret = sext <2 x i1> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i8->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f8(<2 x i8> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f8:
 | 
			
		||||
; CHECK: vlrepb [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vleb [[REG1]], 1(%r2), 1
 | 
			
		||||
; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]]
 | 
			
		||||
; CHECK: vuphh [[REG3:%v[0-9]+]], [[REG2]]
 | 
			
		||||
; CHECK: vuphf %v24, [[REG3]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <2 x i8>, <2 x i8> *%ptr
 | 
			
		||||
  %ret = sext <2 x i8> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i16->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f9(<2 x i16> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f9:
 | 
			
		||||
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuphh [[REG2:%v[0-9]+]], [[REG1]]
 | 
			
		||||
; CHECK: vuphf %v24, [[REG2]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <2 x i16>, <2 x i16> *%ptr
 | 
			
		||||
  %ret = sext <2 x i16> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i32->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f10(<2 x i32> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f10:
 | 
			
		||||
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuphf %v24, [[REG1]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <2 x i32>, <2 x i32> *%ptr
 | 
			
		||||
  %ret = sext <2 x i32> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										106
									
								
								test/CodeGen/SystemZ/vec-move-16.ll
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								test/CodeGen/SystemZ/vec-move-16.ll
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,106 @@
 | 
			
		||||
; Test vector zero-extending loads.
 | 
			
		||||
;
 | 
			
		||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 | 
			
		||||
 | 
			
		||||
; Test a v16i1->v16i8 extension.
 | 
			
		||||
define <16 x i8> @f1(<16 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <16 x i1>, <16 x i1> *%ptr
 | 
			
		||||
  %ret = zext <16 x i1> %val to <16 x i8>
 | 
			
		||||
  ret <16 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v8i1->v8i16 extension.
 | 
			
		||||
define <8 x i16> @f2(<8 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <8 x i1>, <8 x i1> *%ptr
 | 
			
		||||
  %ret = zext <8 x i1> %val to <8 x i16>
 | 
			
		||||
  ret <8 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v8i8->v8i16 extension.
 | 
			
		||||
define <8 x i16> @f3(<8 x i8> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f3:
 | 
			
		||||
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuplhb %v24, [[REG1]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <8 x i8>, <8 x i8> *%ptr
 | 
			
		||||
  %ret = zext <8 x i8> %val to <8 x i16>
 | 
			
		||||
  ret <8 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i1->v4i32 extension.
 | 
			
		||||
define <4 x i32> @f4(<4 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <4 x i1>, <4 x i1> *%ptr
 | 
			
		||||
  %ret = zext <4 x i1> %val to <4 x i32>
 | 
			
		||||
  ret <4 x i32> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i8->v4i32 extension.
 | 
			
		||||
define <4 x i32> @f5(<4 x i8> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f5:
 | 
			
		||||
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
 | 
			
		||||
; CHECK: vuplhh %v24, [[REG2]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <4 x i8>, <4 x i8> *%ptr
 | 
			
		||||
  %ret = zext <4 x i8> %val to <4 x i32>
 | 
			
		||||
  ret <4 x i32> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i16->v4i32 extension.
 | 
			
		||||
define <4 x i32> @f6(<4 x i16> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f6:
 | 
			
		||||
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuplhh %v24, [[REG1]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <4 x i16>, <4 x i16> *%ptr
 | 
			
		||||
  %ret = zext <4 x i16> %val to <4 x i32>
 | 
			
		||||
  ret <4 x i32> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i1->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f7(<2 x i1> *%ptr) {
 | 
			
		||||
; No expected output, but must compile.
 | 
			
		||||
  %val = load <2 x i1>, <2 x i1> *%ptr
 | 
			
		||||
  %ret = zext <2 x i1> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i8->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f8(<2 x i8> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f8:
 | 
			
		||||
; CHECK: vlrepb [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vleb [[REG1]], 1(%r2), 1
 | 
			
		||||
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
 | 
			
		||||
; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
 | 
			
		||||
; CHECK: vuplhf %v24, [[REG3]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <2 x i8>, <2 x i8> *%ptr
 | 
			
		||||
  %ret = zext <2 x i8> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i16->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f9(<2 x i16> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f9:
 | 
			
		||||
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
 | 
			
		||||
; CHECK: vuplhf %v24, [[REG2]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <2 x i16>, <2 x i16> *%ptr
 | 
			
		||||
  %ret = zext <2 x i16> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i32->v2i64 extension.
 | 
			
		||||
define <2 x i64> @f10(<2 x i32> *%ptr) {
 | 
			
		||||
; CHECK-LABEL: f10:
 | 
			
		||||
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
 | 
			
		||||
; CHECK: vuplhf %v24, [[REG1]]
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %val = load <2 x i32>, <2 x i32> *%ptr
 | 
			
		||||
  %ret = zext <2 x i32> %val to <2 x i64>
 | 
			
		||||
  ret <2 x i64> %ret
 | 
			
		||||
}
 | 
			
		||||
@@ -85,3 +85,64 @@ define double @f7(<2 x double> %val1, <2 x double> %val2) {
 | 
			
		||||
  %ret = fsub double %scalar1, %scalar2
 | 
			
		||||
  ret double %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i8 subtraction, which gets widened to v16i8.
 | 
			
		||||
define <2 x i8> @f8(<2 x i8> %dummy, <2 x i8> %val1, <2 x i8> %val2) {
 | 
			
		||||
; CHECK-LABEL: f8:
 | 
			
		||||
; CHECK: vsb %v24, %v26, %v28
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %ret = sub <2 x i8> %val1, %val2
 | 
			
		||||
  ret <2 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i8 subtraction, which gets widened to v16i8.
 | 
			
		||||
define <4 x i8> @f9(<4 x i8> %dummy, <4 x i8> %val1, <4 x i8> %val2) {
 | 
			
		||||
; CHECK-LABEL: f9:
 | 
			
		||||
; CHECK: vsb %v24, %v26, %v28
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %ret = sub <4 x i8> %val1, %val2
 | 
			
		||||
  ret <4 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v8i8 subtraction, which gets widened to v16i8.
 | 
			
		||||
define <8 x i8> @f10(<8 x i8> %dummy, <8 x i8> %val1, <8 x i8> %val2) {
 | 
			
		||||
; CHECK-LABEL: f10:
 | 
			
		||||
; CHECK: vsb %v24, %v26, %v28
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %ret = sub <8 x i8> %val1, %val2
 | 
			
		||||
  ret <8 x i8> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i16 subtraction, which gets widened to v8i16.
 | 
			
		||||
define <2 x i16> @f11(<2 x i16> %dummy, <2 x i16> %val1, <2 x i16> %val2) {
 | 
			
		||||
; CHECK-LABEL: f11:
 | 
			
		||||
; CHECK: vsh %v24, %v26, %v28
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %ret = sub <2 x i16> %val1, %val2
 | 
			
		||||
  ret <2 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v4i16 subtraction, which gets widened to v8i16.
 | 
			
		||||
define <4 x i16> @f12(<4 x i16> %dummy, <4 x i16> %val1, <4 x i16> %val2) {
 | 
			
		||||
; CHECK-LABEL: f12:
 | 
			
		||||
; CHECK: vsh %v24, %v26, %v28
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %ret = sub <4 x i16> %val1, %val2
 | 
			
		||||
  ret <4 x i16> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2i32 subtraction, which gets widened to v4i32.
 | 
			
		||||
define <2 x i32> @f13(<2 x i32> %dummy, <2 x i32> %val1, <2 x i32> %val2) {
 | 
			
		||||
; CHECK-LABEL: f13:
 | 
			
		||||
; CHECK: vsf %v24, %v26, %v28
 | 
			
		||||
; CHECK: br %r14
 | 
			
		||||
  %ret = sub <2 x i32> %val1, %val2
 | 
			
		||||
  ret <2 x i32> %ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
; Test a v2f32 subtraction, which gets widened to v4f32.
 | 
			
		||||
define <2 x float> @f14(<2 x float> %val1, <2 x float> %val2) {
 | 
			
		||||
; No particular output expected, but must compile.
 | 
			
		||||
  %ret = fsub <2 x float> %val1, %val2
 | 
			
		||||
  ret <2 x float> %ret
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user