[Hexagon] Add support for vector instructions

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232728 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-15 19:24:33 +00:00 · 2015-03-19 16:33:08 +00:00
parent c6e489793c
commit 07121ea974
49 changed files with 2912 additions and 40 deletions
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -711,6 +711,36 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
 //
 SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
  SDLoc dl(N);
+
+  SDValue Op0 = N->getOperand(0);
+  EVT OpVT = Op0.getValueType();
+  unsigned OpBW = OpVT.getSizeInBits();
+
+  // Special handling for zero-extending a vector of booleans.
+  if (OpVT.isVector() && OpVT.getVectorElementType() == MVT::i1 && OpBW <= 64) {
+    SDNode *Mask = CurDAG->getMachineNode(Hexagon::C2_mask, dl, MVT::i64, Op0);
+    unsigned NE = OpVT.getVectorNumElements();
+    EVT ExVT = N->getValueType(0);
+    unsigned ES = ExVT.getVectorElementType().getSizeInBits();
+    uint64_t MV = 0, Bit = 1;
+    for (unsigned i = 0; i < NE; ++i) {
+      MV |= Bit;
+      Bit <<= ES;
+    }
+    SDValue Ones = CurDAG->getTargetConstant(MV, MVT::i64);
+    SDNode *OnesReg = CurDAG->getMachineNode(Hexagon::CONST64_Int_Real, dl,
+                                             MVT::i64, Ones);
+    if (ExVT.getSizeInBits() == 32) {
+      SDNode *And = CurDAG->getMachineNode(Hexagon::A2_andp, dl, MVT::i64,
+                                           SDValue(Mask,0), SDValue(OnesReg,0));
+      SDValue SubR = CurDAG->getTargetConstant(Hexagon::subreg_loreg, MVT::i32);
+      return CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
+                                    SDValue(And,0), SubR);
+    }
+    return CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
+                                  SDValue(Mask,0), SDValue(OnesReg,0));
+  }
+
  SDNode *IsIntrinsic = N->getOperand(0).getNode();
  if ((IsIntrinsic->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
    unsigned ID =
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -164,6 +164,12 @@ CC_Hexagon (unsigned ValNo, MVT ValVT,
      LocInfo = CCValAssign::ZExt;
    else
      LocInfo = CCValAssign::AExt;
+  } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::BCvt;
+  } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
+    LocVT = MVT::i64;
+    LocInfo = CCValAssign::BCvt;
  }

  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
@@ -239,6 +245,12 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
      LocInfo = CCValAssign::ZExt;
    else
      LocInfo = CCValAssign::AExt;
+  } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::BCvt;
+  } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
+    LocVT = MVT::i64;
+    LocInfo = CCValAssign::BCvt;
  }

  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
@@ -944,6 +956,192 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                      false, 0);
 }

+// Creates the Hexagon splat node for Val: VSPLATB when VT is v4i8 and VSPLATH
+// when VT is v4i16.  An empty SDValue is returned for every other type.
+static SDValue createSplat(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue Val) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v4i8:
+    return DAG.getNode(HexagonISD::VSPLATB, dl, VT, Val);
+  case MVT::v4i16:
+    return DAG.getNode(HexagonISD::VSPLATH, dl, VT, Val);
+  default:
+    return SDValue();
+  }
+}
+
+// Returns true when sign-extending N is considered free: N is either a
+// truncate of an AssertSext value, or a load (we have sign-extended loads).
+static bool isSExtFree(SDValue N) {
+  switch (N.getOpcode()) {
+  case ISD::TRUNCATE:
+    // A sign-extend of a truncate of a sign-extend is free.
+    return N.getOperand(0).getOpcode() == ISD::AssertSext;
+  case ISD::LOAD:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Lowers a population count.  A constant input is folded into an i64 target
+// constant; any other input gets a HexagonISD::POPCOUNT node with an i32
+// result that is zero-extended to i64.
+SDValue HexagonTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Src))
+    return DAG.getTargetConstant(countPopulation(CN->getZExtValue()), MVT::i64);
+
+  SDLoc dl(Op);
+  SDValue Count = DAG.getNode(HexagonISD::POPCOUNT, dl, MVT::i32, Src);
+  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Count);
+}
+
+// Custom lowering for SETCC.
+//  - v2i16 operands are widened to v2i32 (sign- or zero-extended according to
+//    the signedness of the condition code) and compared there, giving v2i1.
+//  - Any other vector compare is returned unchanged.
+//  - An i8/i16 EQ/NE compare is redone on sign-extended i32 operands when the
+//    RHS is a negative constant, or when sign-extending either operand is free.
+// In all remaining cases an empty SDValue is returned.
+SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue CondOp = Op.getOperand(2);
+  ISD::CondCode CC = cast<CondCodeSDNode>(CondOp)->get();
+
+  EVT ResVT = Op.getValueType();
+  EVT LTy = LHS.getValueType();
+  EVT RTy = RHS.getValueType();
+
+  if (LTy == MVT::v2i16) {
+    const bool IsSigned = ISD::isSignedIntSetCC(CC);
+    assert(IsSigned || ISD::isUnsignedIntSetCC(CC));
+    unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    SDValue WideL = DAG.getNode(ExtOpc, dl, MVT::v2i32, LHS);
+    SDValue WideR = DAG.getNode(ExtOpc, dl, MVT::v2i32, RHS);
+    return DAG.getNode(ISD::SETCC, dl, MVT::v2i1, WideL, WideR, CondOp);
+  }
+
+  // Treat all other vector types as legal.
+  if (ResVT.isVector())
+    return Op;
+
+  // Equals and not equals should use sign-extend, not zero-extend, since
+  // we can represent small negative values in the compare instructions.
+  // The LLVM default is to use zero-extend arbitrarily in these cases.
+  const bool IsEquality = CC == ISD::SETEQ || CC == ISD::SETNE;
+  const bool NarrowR = RTy == MVT::i8 || RTy == MVT::i16;
+  const bool NarrowL = LTy == MVT::i8 || LTy == MVT::i16;
+  if (!IsEquality || !NarrowR || !NarrowL)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+  const bool NegativeConst = C && C->getAPIntValue().isNegative();
+  if (!NegativeConst && !isSExtFree(LHS) && !isSExtFree(RHS))
+    return SDValue();
+
+  SDValue L32 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
+  SDValue R32 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
+  return DAG.getNode(ISD::SETCC, dl, ResVT, L32, R32, CondOp);
+}
+
+// Custom lowering for VSELECT.  Only v2i16 is handled: both value operands are
+// zero-extended to v2i32, the select is done on v2i32 and the result is
+// truncated back to v2i16.  Every other type yields an empty SDValue.
+SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG)
+      const {
+  SDValue ValA = Op.getOperand(1);
+  if (ValA.getValueType() != MVT::v2i16)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue Pred = Op.getOperand(0);
+  SDValue ValB = Op.getOperand(2);
+  SDValue WideA = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, ValA);
+  SDValue WideB = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, ValB);
+  SDValue WideSel = DAG.getNode(ISD::VSELECT, DL, MVT::v2i32, Pred, WideA,
+                                WideB);
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i16, WideSel);
+}
+
+// Handle only specific vector loads: v4i16 is the one supported type, anything
+// else hits llvm_unreachable.
+//
+// A v4i16 load whose alignment is exactly 2 is split into four i16 extending
+// loads at byte offsets 0, 2, 4 and 6.  Each pair is merged with SHL/OR into an
+// i32 and the two i32 halves are joined with HexagonISD::COMBINE.  For any
+// other alignment a single i64 load is issued.  In both cases the i64 value is
+// bitcast to the requested type and returned together with the chain of the
+// newly created load(s).
+SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  SDValue Chain = LoadNode->getChain();
+  SDValue Base = LoadNode->getBasePtr();
+  unsigned Alignment = LoadNode->getAlignment();
+  SDValue Result;
+  SDValue LoadChain;
+
+  // A non-extending load is performed as a zero-extending one, since the
+  // i16 pieces are widened to i32 before they are OR'ed together.
+  ISD::LoadExtType Ext = LoadNode->getExtensionType();
+  if (Ext == ISD::NON_EXTLOAD)
+    Ext = ISD::ZEXTLOAD;
+
+  if (VT != MVT::v4i16)
+    llvm_unreachable("Custom lowering unsupported load");
+
+  if (Alignment == 2) {
+    SDValue Loads[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      const unsigned Offset = 2 * i;
+      SDValue Ptr = Base;
+      if (Offset != 0)
+        Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base,
+                          DAG.getConstant(Offset, MVT::i32));
+      // Describe the bytes this piece really reads; reusing the original
+      // pointer info unchanged would claim that every piece reads offset 0.
+      const MachinePointerInfo PtrInfo =
+          LoadNode->getPointerInfo().getWithOffset(Offset);
+      Loads[i] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, PtrInfo,
+                                MVT::i16, LoadNode->isVolatile(),
+                                LoadNode->isNonTemporal(),
+                                LoadNode->isInvariant(), Alignment);
+    }
+
+    // SHL 16, then OR: (base+2 : base) and (base+6 : base+4).
+    SDValue ShiftAmount = DAG.getConstant(16, MVT::i32);
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount);
+    SDValue LoWord = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp, Loads[0]);
+    Tmp = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount);
+    SDValue HiWord = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp, Loads[2]);
+
+    // Combine to i64. This could be optimised out later if we can
+    // affect reg allocation of this code.
+    Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, HiWord, LoWord);
+    LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                            Loads[0].getValue(1), Loads[1].getValue(1),
+                            Loads[2].getValue(1), Loads[3].getValue(1));
+  } else {
+    // Perform default type expansion.
+    Result = DAG.getLoad(MVT::i64, DL, Chain, Op.getOperand(1),
+                         LoadNode->getPointerInfo(), LoadNode->isVolatile(),
+                         LoadNode->isNonTemporal(), LoadNode->isInvariant(),
+                         LoadNode->getAlignment());
+    LoadChain = Result.getValue(1);
+  }
+
+  Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
+  // Since we pretend to lower a load, we need the original chain
+  // info attached to the result.
+  SDValue Ops[] = { Result, LoadChain };
+
+  return DAG.getMergeValues(Ops, DL);
+}
+
+
 SDValue
 HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  EVT ValTy = Op.getValueType();
@@ -1028,6 +1226,19 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
  return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result);
 }

+// Marks LOAD and STORE of VT as Promote and records PromotedLdStVT as the type
+// they are promoted to.  Nothing is done when both types are the same.
+void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) {
+  if (VT == PromotedLdStVT)
+    return;
+
+  const MVT From = VT.getSimpleVT();
+  const MVT To = PromotedLdStVT.getSimpleVT();
+
+  setOperationAction(ISD::LOAD, From, Promote);
+  AddPromotedToType(ISD::LOAD, From, To);
+
+  setOperationAction(ISD::STORE, From, Promote);
+  AddPromotedToType(ISD::STORE, From, To);
+}
+
 SDValue
 HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
@@ -1045,14 +1256,105 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
    : TargetLowering(TM), Subtarget(&STI) {

  // Set up the register classes.
+  addRegisterClass(MVT::v2i1, &Hexagon::PredRegsRegClass);  // bbbbaaaa
+  addRegisterClass(MVT::v4i1, &Hexagon::PredRegsRegClass);  // ddccbbaa
+  addRegisterClass(MVT::v8i1, &Hexagon::PredRegsRegClass);  // hgfedcba
  addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
-  addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass);
+  promoteLdStType(MVT::v4i8, MVT::i32);
+  promoteLdStType(MVT::v2i16, MVT::i32);

  if (Subtarget->hasV5TOps()) {
    addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
    addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
  }

+  addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v8i8, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
+  promoteLdStType(MVT::v8i8, MVT::i64);
+
+  // Custom lower v4i16 load only. Let v4i16 store to be
+  // promoted for now.
+  setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
+  AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64);
+  setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64);
+  promoteLdStType(MVT::v2i32, MVT::i64);
+
+  for (unsigned i = (unsigned) MVT::FIRST_VECTOR_VALUETYPE;
+       i <= (unsigned) MVT::LAST_VECTOR_VALUETYPE; ++i) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType) i;
+
+    // Hexagon does not have support for the following operations,
+    // so they need to be expanded.
+    setOperationAction(ISD::SELECT, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FNEG, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+
+    // Expand all any extend loads.
+    for (unsigned j = (unsigned) MVT::FIRST_VECTOR_VALUETYPE;
+                  j <= (unsigned) MVT::LAST_VECTOR_VALUETYPE; ++j)
+      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType) j, VT, Expand);
+
+    // Expand all trunc stores.
+    for (unsigned TargetVT = (unsigned) MVT::FIRST_VECTOR_VALUETYPE;
+         TargetVT <= (unsigned) MVT::LAST_VECTOR_VALUETYPE; ++TargetVT)
+      setTruncStoreAction(VT, (MVT::SimpleValueType) TargetVT, Expand);
+
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+    setOperationAction(ISD::ConstantPool, VT, Expand);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+    setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
+    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
+
+    if (!isTypeLegal(VT))
+      continue;
+
+    setOperationAction(ISD::ADD, VT, Legal);
+    setOperationAction(ISD::SUB, VT, Legal);
+    setOperationAction(ISD::MUL, VT, Legal);
+
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+  }
+
+  setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
+  setOperationAction(ISD::VSELECT, MVT::v2i16, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+
+  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+
  addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);

  computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -1363,6 +1665,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

+  // Hexagon needs to optimize cases with negative constants.
+  setOperationAction(ISD::SETCC, MVT::i16, Custom);
+  setOperationAction(ISD::SETCC, MVT::i8, Custom);
+
  if (EmitJumpTables) {
    setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  } else {
@@ -1420,9 +1726,17 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
  setOperationAction(ISD::CTLZ, MVT::i64, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTR, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
@@ -1468,27 +1782,63 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
 const char*
 HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
-    default: return nullptr;
-    case HexagonISD::CONST32:     return "HexagonISD::CONST32";
-    case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP";
-    case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real";
-    case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC";
-    case HexagonISD::CMPICC:      return "HexagonISD::CMPICC";
-    case HexagonISD::CMPFCC:      return "HexagonISD::CMPFCC";
-    case HexagonISD::BRICC:       return "HexagonISD::BRICC";
-    case HexagonISD::BRFCC:       return "HexagonISD::BRFCC";
-    case HexagonISD::SELECT_ICC:  return "HexagonISD::SELECT_ICC";
-    case HexagonISD::SELECT_FCC:  return "HexagonISD::SELECT_FCC";
-    case HexagonISD::Hi:          return "HexagonISD::Hi";
-    case HexagonISD::Lo:          return "HexagonISD::Lo";
-    case HexagonISD::FTOI:        return "HexagonISD::FTOI";
-    case HexagonISD::ITOF:        return "HexagonISD::ITOF";
-    case HexagonISD::CALLv3:      return "HexagonISD::CALLv3";
-    case HexagonISD::CALLv3nr:    return "HexagonISD::CALLv3nr";
-    case HexagonISD::CALLR:       return "HexagonISD::CALLR";
-    case HexagonISD::RET_FLAG:    return "HexagonISD::RET_FLAG";
-    case HexagonISD::BR_JT:       return "HexagonISD::BR_JT";
-    case HexagonISD::TC_RETURN:   return "HexagonISD::TC_RETURN";
+  // Each string must spell the enumerator exactly; opcodes that are not
+  // listed here yield nullptr.
+  default: return nullptr;
+  case HexagonISD::CONST32:     return "HexagonISD::CONST32";
+  case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP";
+  case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real";
+  case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC";
+  case HexagonISD::CMPICC:      return "HexagonISD::CMPICC";
+  case HexagonISD::CMPFCC:      return "HexagonISD::CMPFCC";
+  case HexagonISD::BRICC:       return "HexagonISD::BRICC";
+  case HexagonISD::BRFCC:       return "HexagonISD::BRFCC";
+  case HexagonISD::SELECT_ICC:  return "HexagonISD::SELECT_ICC";
+  case HexagonISD::SELECT_FCC:  return "HexagonISD::SELECT_FCC";
+  case HexagonISD::Hi:          return "HexagonISD::Hi";
+  case HexagonISD::Lo:          return "HexagonISD::Lo";
+  case HexagonISD::JT: return "HexagonISD::JT";
+  case HexagonISD::CP: return "HexagonISD::CP";
+  case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT";
+  case HexagonISD::COMBINE: return "HexagonISD::COMBINE";
+  case HexagonISD::PACKHL: return "HexagonISD::PACKHL";
+  case HexagonISD::VSPLATB: return "HexagonISD::VSPLATB";
+  case HexagonISD::VSPLATH: return "HexagonISD::VSPLATH";
+  case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB";
+  case HexagonISD::SHUFFEH: return "HexagonISD::SHUFFEH";
+  case HexagonISD::SHUFFOB: return "HexagonISD::SHUFFOB";
+  case HexagonISD::SHUFFOH: return "HexagonISD::SHUFFOH";
+  case HexagonISD::VSXTBH: return "HexagonISD::VSXTBH";
+  case HexagonISD::VSXTBW: return "HexagonISD::VSXTBW";
+  case HexagonISD::VSRAW: return "HexagonISD::VSRAW";
+  case HexagonISD::VSRAH: return "HexagonISD::VSRAH";
+  case HexagonISD::VSRLW: return "HexagonISD::VSRLW";
+  case HexagonISD::VSRLH: return "HexagonISD::VSRLH";
+  case HexagonISD::VSHLW: return "HexagonISD::VSHLW";
+  case HexagonISD::VSHLH: return "HexagonISD::VSHLH";
+  case HexagonISD::VCMPBEQ: return "HexagonISD::VCMPBEQ";
+  case HexagonISD::VCMPBGT: return "HexagonISD::VCMPBGT";
+  case HexagonISD::VCMPBGTU: return "HexagonISD::VCMPBGTU";
+  case HexagonISD::VCMPHEQ: return "HexagonISD::VCMPHEQ";
+  case HexagonISD::VCMPHGT: return "HexagonISD::VCMPHGT";
+  case HexagonISD::VCMPHGTU: return "HexagonISD::VCMPHGTU";
+  case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ";
+  case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT";
+  case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU";
+  case HexagonISD::INSERT_ri: return "HexagonISD::INSERT_ri";
+  case HexagonISD::INSERT_rd: return "HexagonISD::INSERT_rd";
+  case HexagonISD::INSERT_riv: return "HexagonISD::INSERT_riv";
+  case HexagonISD::INSERT_rdv: return "HexagonISD::INSERT_rdv";
+  case HexagonISD::EXTRACTU_ri: return "HexagonISD::EXTRACTU_ri";
+  case HexagonISD::EXTRACTU_rd: return "HexagonISD::EXTRACTU_rd";
+  case HexagonISD::EXTRACTU_riv: return "HexagonISD::EXTRACTU_riv";
+  case HexagonISD::EXTRACTU_rdv: return "HexagonISD::EXTRACTU_rdv";
+  case HexagonISD::FTOI:        return "HexagonISD::FTOI";
+  case HexagonISD::ITOF:        return "HexagonISD::ITOF";
+  case HexagonISD::CALLv3:      return "HexagonISD::CALLv3";
+  case HexagonISD::CALLv3nr:    return "HexagonISD::CALLv3nr";
+  case HexagonISD::CALLR:       return "HexagonISD::CALLR";
+  case HexagonISD::RET_FLAG:    return "HexagonISD::RET_FLAG";
+  case HexagonISD::BR_JT:       return "HexagonISD::BR_JT";
+  case HexagonISD::TC_RETURN:   return "HexagonISD::TC_RETURN";
  case HexagonISD::EH_RETURN: return "HexagonISD::EH_RETURN";
  }
 }
@@ -1510,6 +1860,474 @@ bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  return ((VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32));
 }

+// shouldExpandBuildVectorWithShuffles
+// Vectors whose elements are neither 8 nor 16 bits wide are never expanded
+// with shuffles; for the rest the generic TargetLowering answer is used.
+bool
+HexagonTargetLowering::shouldExpandBuildVectorWithShuffles(EVT VT,
+                                  unsigned DefinedValues) const {
+  // Hexagon vector shuffle operates on element sizes of bytes or halfwords
+  switch (VT.getVectorElementType().getSizeInBits()) {
+  case 8:
+  case 16:
+    return TargetLowering::shouldExpandBuildVectorWithShuffles(VT,
+                                                               DefinedValues);
+  default:
+    return false;
+  }
+}
+
+// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, mask).  V1 and V2 are
+// the two vectors to select data from; the permutation is carried by the
+// ShuffleVectorSDNode.
+//
+// Only splat shuffles are lowered here, by creating a splat node through
+// createSplat (which supports v4i8 and v4i16).  The splatted scalar is:
+//   - the scalar operand, when lane 0 of a SCALAR_TO_VECTOR (or of an
+//     equivalent BUILD_VECTOR) is splatted;
+//   - otherwise the element extracted from the selected lane of V1 or V2.
+// An empty SDValue is returned for everything else, so that Expand handles it.
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  if (V2.getOpcode() == ISD::UNDEF)
+    V2 = V1;
+
+  // FIXME: We need to support more general vector shuffles.  For now, let
+  // expand handle everything that is not a splat.
+  if (!SVN->isSplat())
+    return SDValue();
+
+  int Lane = SVN->getSplatIndex();
+  if (Lane == -1) Lane = 0;
+
+  if (Lane == 0) {
+    // Test if V1 is a SCALAR_TO_VECTOR.
+    if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return createSplat(DAG, dl, VT, V1.getOperand(0));
+
+    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+    // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+    // reaches it).
+    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(0))) {
+      bool IsScalarToVector = true;
+      for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+        if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+          IsScalarToVector = false;
+          break;
+        }
+      if (IsScalarToVector)
+        return createSplat(DAG, dl, VT, V1.getOperand(0));
+    }
+  }
+
+  // createSplat only knows v4i8 and v4i16; do not build an extract that
+  // would be thrown away (or be ill-typed) for any other vector type.
+  if (VT.getSimpleVT() != MVT::v4i8 && VT.getSimpleVT() != MVT::v4i16)
+    return SDValue();
+
+  // General splat: the value to replicate is the *element* found in the
+  // selected lane, not the lane number itself.  Mask indices at or beyond the
+  // element count of the result refer to the second input vector.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned SrcLane = Lane;
+  SDValue Src = V1;
+  if (SrcLane >= NumElts) {
+    Src = V2;
+    SrcLane -= NumElts;
+  }
+  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Src,
+                            DAG.getConstant(SrcLane, MVT::i32));
+  return createSplat(DAG, dl, VT, Elt);
+}
+
+// Reports true when every operand of the BUILD_VECTOR is the same value as
+// its first operand.
+static bool isCommonSplatElement(BuildVectorSDNode *BVN) {
+  const unsigned NumOps = BVN->getNumOperands();
+  const SDValue First = BVN->getOperand(0);
+
+  unsigned Idx = 1;
+  while (Idx != NumOps && BVN->getOperand(Idx) == First)
+    ++Idx;
+  return Idx == NumOps;
+}
+
+// LowerVECTOR_SHIFT - Lower a vector shift. Try to convert
+// <VT> = SHL/SRA/SRL <VT> by <VT> to Hexagon specific
+// <VT> = SHL/SRA/SRL <VT> by <IT/i32>.
+//
+// The conversion applies only when the shift amount (operand 1) is a
+// BUILD_VECTOR whose operands are all the same value, and the type is v4i16 or
+// v2i32.  Shifts are not commutative, so a splat in operand 0 (the value being
+// shifted) does not qualify.  An empty SDValue is returned whenever the
+// conversion does not apply.
+static SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) {
+  SDValue Vec = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Amt.getNode());
+  if (!BVN || !isCommonSplatElement(BVN))
+    return SDValue();
+
+  // All lanes shift by the same amount: use it as the scalar operand.
+  SDValue CommonSplat = BVN->getOperand(0);
+  SDValue Result;
+
+  if (VT.getSimpleVT() == MVT::v4i16) {
+    switch (Op.getOpcode()) {
+    case ISD::SRA:
+      Result = DAG.getNode(HexagonISD::VSRAH, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SHL:
+      Result = DAG.getNode(HexagonISD::VSHLH, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SRL:
+      Result = DAG.getNode(HexagonISD::VSRLH, dl, VT, Vec, CommonSplat);
+      break;
+    default:
+      return SDValue();
+    }
+  } else if (VT.getSimpleVT() == MVT::v2i32) {
+    switch (Op.getOpcode()) {
+    case ISD::SRA:
+      Result = DAG.getNode(HexagonISD::VSRAW, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SHL:
+      Result = DAG.getNode(HexagonISD::VSHLW, dl, VT, Vec, CommonSplat);
+      break;
+    case ISD::SRL:
+      Result = DAG.getNode(HexagonISD::VSRLW, dl, VT, Vec, CommonSplat);
+      break;
+    default:
+      return SDValue();
+    }
+  } else {
+    return SDValue();
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+}
+
+// Custom lowering for BUILD_VECTOR of vectors that are at most 64 bits wide
+// (wider vectors yield an empty SDValue so that Expand splits them).
+// The following strategies are tried in order:
+//   1. v4i8/v4i16 constant splat of at most 16 bits -> splat node.
+//   2. v2i32 -> HexagonISD::COMBINE, unless both elements are constants and
+//      neither of them fits in a signed 8-bit immediate.
+//   3. v2i16 with a non-constant element -> HexagonISD::PACKHL, of which the
+//      low subregister is extracted.
+//   4. Otherwise the constant elements are packed into one i32/i64 constant
+//      and the non-constant elements are added with INSERT_riv/INSERT_rdv.
+SDValue
+HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  unsigned Size = VT.getSizeInBits();
+
+  // A vector larger than 64 bits cannot be represented in Hexagon.
+  // Expand will split the vector.
+  if (Size > 64)
+    return SDValue();
+
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  unsigned NElts = BVN->getNumOperands();
+
+  // Try to generate a SPLAT instruction.
+  if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) &&
+      (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                            HasAnyUndefs, 0, true) && SplatBitSize <= 16)) {
+    unsigned SplatBits = APSplatBits.getZExtValue();
+    // Sign-extend the SplatBitSize-bit splat value to 32 bits.
+    int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >>
+                       (32 - SplatBitSize));
+    return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, MVT::i32));
+  }
+
+  // Try to generate COMBINE to build v2i32 vectors.
+  if (VT.getSimpleVT() == MVT::v2i32) {
+    SDValue V0 = BVN->getOperand(0);
+    SDValue V1 = BVN->getOperand(1);
+
+    if (V0.getOpcode() == ISD::UNDEF)
+      V0 = DAG.getConstant(0, MVT::i32);
+    if (V1.getOpcode() == ISD::UNDEF)
+      V1 = DAG.getConstant(0, MVT::i32);
+
+    ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(V0);
+    ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(V1);
+    // If the element isn't a constant, it is in a register:
+    // generate a COMBINE Register Register instruction.
+    if (!C0 || !C1)
+      return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0);
+
+    // If one of the operands is an 8 bit integer constant, generate
+    // a COMBINE Immediate Immediate instruction.
+    if (isInt<8>(C0->getSExtValue()) ||
+        isInt<8>(C1->getSExtValue()))
+      return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0);
+  }
+
+  // Try to generate a S2_packhl to build v2i16 vectors.
+  if (VT.getSimpleVT() == MVT::v2i16) {
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      if (BVN->getOperand(i).getOpcode() == ISD::UNDEF)
+        continue;
+      // If the element isn't a constant, it is in a register:
+      // generate a S2_packhl instruction.
+      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
+        SDValue pack = DAG.getNode(HexagonISD::PACKHL, dl, MVT::v4i16,
+                                   BVN->getOperand(1), BVN->getOperand(0));
+
+        return DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl, MVT::v2i16,
+                                          pack);
+      }
+    }
+  }
+
+  // In the general case, generate a CONST32 or a CONST64 for constant vectors,
+  // and insert_vector_elt for all the other cases.
+  uint64_t Res = 0;
+  unsigned EltSize = Size / NElts;
+  SDValue ConstVal;
+  uint64_t Mask = ~uint64_t(0ULL) >> (64 - EltSize);
+  bool HasNonConstantElements = false;
+
+  for (unsigned i = 0, e = NElts; i != e; ++i) {
+    // LLVM's BUILD_VECTOR operands are in Little Endian mode, whereas Hexagon's
+    // combine, const64, etc. are Big Endian.
+    unsigned OpIdx = NElts - i - 1;
+    SDValue Operand = BVN->getOperand(OpIdx);
+    if (Operand.getOpcode() == ISD::UNDEF)
+      continue;
+
+    // Non-constant elements contribute zero bits here; they are inserted
+    // into the packed constant afterwards.
+    int64_t Val = 0;
+    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Operand))
+      Val = Cst->getSExtValue();
+    else
+      HasNonConstantElements = true;
+
+    Val &= Mask;
+    Res = (Res << EltSize) | Val;
+  }
+
+  if (Size == 64)
+    ConstVal = DAG.getConstant(Res, MVT::i64);
+  else
+    ConstVal = DAG.getConstant(Res, MVT::i32);
+
+  // When there are non constant operands, add them with INSERT_VECTOR_ELT to
+  // ConstVal, the constant part of the vector.
+  if (HasNonConstantElements) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue Width = DAG.getConstant(EltVT.getSizeInBits(), MVT::i64);
+    SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                  DAG.getConstant(32, MVT::i64));
+
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      // LLVM's BUILD_VECTOR operands are in Little Endian mode, whereas Hexagon
+      // is Big Endian.
+      unsigned OpIdx = NElts - i - 1;
+      SDValue Operand = BVN->getOperand(OpIdx);
+      if (isa<ConstantSDNode>(Operand))
+        // This operand is already in ConstVal.
+        continue;
+      if (Operand.getOpcode() == ISD::UNDEF)
+        // The packing loop above skipped undefined elements; there is
+        // nothing to insert for them either.
+        continue;
+
+      if (VT.getSizeInBits() == 64 &&
+          Operand.getValueType().getSizeInBits() == 32) {
+        SDValue C = DAG.getConstant(0, MVT::i32);
+        Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand);
+      }
+
+      // Third INSERT operand: the element width goes in the upper 32 bits and
+      // the bit offset of the element (index * width) in the lower bits.
+      SDValue Idx = DAG.getConstant(OpIdx, MVT::i64);
+      SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width);
+      SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+      const SDValue Ops[] = {ConstVal, Operand, Combined};
+
+      if (VT.getSizeInBits() == 32)
+        ConstVal = DAG.getNode(HexagonISD::INSERT_riv, dl, MVT::i32, Ops);
+      else
+        ConstVal = DAG.getNode(HexagonISD::INSERT_rdv, dl, MVT::i64, Ops);
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal);
+}
+// Lower CONCAT_VECTORS: COMBINE two 32-bit vectors, else insert each operand.
+SDValue
+HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned NElts = Op.getNumOperands();
+  SDValue Vec = Op.getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), MVT::i64);
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, MVT::i64));
+  SDValue ConstVal = DAG.getConstant(0, MVT::i64);
+  // W and S are non-null only if Width and Shifted are ConstantSDNodes.
+  ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width);
+  ConstantSDNode *S = dyn_cast<ConstantSDNode>(Shifted);
+
+  if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) {
+    if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) {
+      // We are trying to concat two v2i16 to a single v4i16.
+      SDValue Vec0 = Op.getOperand(1);
+      SDValue Combined  = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec);
+      return DAG.getNode(ISD::BITCAST, dl, VT, Combined);
+    }
+  }
+
+  if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) {
+    if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) {
+      // We are trying to concat two v4i8 to a single v8i8.
+      SDValue Vec0 = Op.getOperand(1);
+      SDValue Combined  = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec);
+      return DAG.getNode(ISD::BITCAST, dl, VT, Combined);
+    }
+  }
+  // General case: insert each operand into ConstVal, last operand first.
+  for (unsigned i = 0, e = NElts; i != e; ++i) {
+    unsigned OpIdx = NElts - i - 1;
+    SDValue Operand = Op.getOperand(OpIdx);
+
+    if (VT.getSizeInBits() == 64 &&
+        Operand.getValueType().getSizeInBits() == 32) {
+      SDValue C = DAG.getConstant(0, MVT::i32);
+      Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand);
+    }
+    // Combined packs Width << 32 together with the offset OpIdx * Width.
+    SDValue Idx = DAG.getConstant(OpIdx, MVT::i64);
+    SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width);
+    SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+    const SDValue Ops[] = {ConstVal, Operand, Combined};
+
+    if (VT.getSizeInBits() == 32)
+      ConstVal = DAG.getNode(HexagonISD::INSERT_riv, dl, MVT::i32, Ops);
+    else
+      ConstVal = DAG.getNode(HexagonISD::INSERT_rdv, dl, MVT::i64, Ops);
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal);
+}
+// Lower EXTRACT_VECTOR_ELT/EXTRACT_SUBVECTOR to a subreg copy or EXTRACTU_*.
+SDValue
+HexagonTargetLowering::LowerEXTRACT_VECTOR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int VTN = VT.isVector() ? VT.getVectorNumElements() : 1;
+  SDLoc dl(Op);
+  SDValue Idx = Op.getOperand(1);
+  SDValue Vec = Op.getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  int EltSize = EltVT.getSizeInBits();
+  SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT ?
+                                  EltSize : VTN * EltSize, MVT::i64);
+
+  // Constant element number.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Idx)) {
+    SDValue Offset = DAG.getConstant(C->getZExtValue() * EltSize, MVT::i32);
+    const SDValue Ops[] = {Vec, Width, Offset};
+
+    ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width);
+    assert(W && "Non constant width in LowerEXTRACT_VECTOR");
+
+    SDValue N;
+    // For certain extracts, it is a simple _hi/_lo subreg.
+    if (VecVT.getSimpleVT() == MVT::v2i32) {
+      // v2i32 -> i32 vselect.
+      if (C->getZExtValue() == 0)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl,
+                                       MVT::i32, Vec);
+      else if (C->getZExtValue() == 1)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_hireg, dl,
+                                       MVT::i32, Vec);
+      else
+        llvm_unreachable("Bad offset");
+    } else if ((VecVT.getSimpleVT() == MVT::v4i16) &&
+               (W->getZExtValue() == 32)) {
+      // v4i16 -> v2i16/i32 vselect.
+      if (C->getZExtValue() == 0)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl,
+                                       MVT::i32, Vec);
+      else if (C->getZExtValue() == 2)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_hireg, dl,
+                                       MVT::i32, Vec);
+      else
+        llvm_unreachable("Bad offset");
+    } else if ((VecVT.getSimpleVT() == MVT::v8i8) &&
+               (W->getZExtValue() == 32)) {
+      // v8i8 -> v4i8/i32 vselect.
+      if (C->getZExtValue() == 0)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl,
+                                       MVT::i32, Vec);
+      else if (C->getZExtValue() == 4)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_hireg, dl,
+                                       MVT::i32, Vec);
+      else
+        llvm_unreachable("Bad offset");
+    } else if (VecVT.getSizeInBits() == 32) {
+      N = DAG.getNode(HexagonISD::EXTRACTU_ri, dl, MVT::i32, Ops);
+    } else {
+      N = DAG.getNode(HexagonISD::EXTRACTU_rd, dl, MVT::i64, Ops);
+      if (VT.getSizeInBits() == 32)
+        N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl, MVT::i32, N);
+    }
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number; the offset is widened to i64 for the OR below.
+  SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx,
+                               DAG.getConstant(EltSize, MVT::i32));
+  Offset = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Offset);
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, MVT::i64));
+  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+  const SDValue Ops[] = {Vec, Combined};
+
+  SDValue N;
+  if (VecVT.getSizeInBits() == 32) {
+    N = DAG.getNode(HexagonISD::EXTRACTU_riv, dl, MVT::i32, Ops);
+  } else {
+    N = DAG.getNode(HexagonISD::EXTRACTU_rdv, dl, MVT::i64, Ops);
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getTargetExtractSubreg(Hexagon::subreg_loreg, dl, MVT::i32, N);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, VT, N);
+}
+// Lower INSERT_VECTOR_ELT/INSERT_SUBVECTOR to Hexagon INSERT_* nodes.
+SDValue
+HexagonTargetLowering::LowerINSERT_VECTOR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int VTN = VT.isVector() ? VT.getVectorNumElements() : 1;
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Val = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  int EltSize = EltVT.getSizeInBits();
+  SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::INSERT_VECTOR_ELT ?
+                                  EltSize : VTN * EltSize, MVT::i64);
+  // NOTE(review): Width uses VT, not Val, for INSERT_SUBVECTOR - confirm.
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Idx)) {
+    SDValue Offset = DAG.getConstant(C->getSExtValue() * EltSize, MVT::i32);
+    const SDValue Ops[] = {Vec, Val, Width, Offset};
+    SDValue N;
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getNode(HexagonISD::INSERT_ri, dl, MVT::i32, Ops);
+    else
+      N = DAG.getNode(HexagonISD::INSERT_rd, dl, MVT::i64, Ops);
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number; the offset is widened to i64 for the OR below.
+  SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx,
+                               DAG.getConstant(EltSize, MVT::i32));
+  Offset = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Offset);
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, MVT::i64));
+  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+
+  if (VT.getSizeInBits() == 64 &&
+      Val.getValueType().getSizeInBits() == 32) {
+    SDValue C = DAG.getConstant(0, MVT::i32);
+    Val = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Val);
+  }
+
+  const SDValue Ops[] = {Vec, Val, Combined};
+  SDValue N;
+  if (VT.getSizeInBits() == 32)
+    N = DAG.getNode(HexagonISD::INSERT_riv, dl, MVT::i32, Ops);
+  else
+    N = DAG.getNode(HexagonISD::INSERT_rdv, dl, MVT::i64, Ops);
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, N);
+}
+
 bool
 HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  // Assuming the caller does not have either a signext or zeroext modifier, and
@@ -1554,7 +2372,19 @@ SDValue
 HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
    default: llvm_unreachable("Should not custom lower this!");
-    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+    case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
+    case ISD::INSERT_SUBVECTOR:   return LowerINSERT_VECTOR(Op, DAG);
+    case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR(Op, DAG);
+    case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_VECTOR(Op, DAG);
+    case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR(Op, DAG);
+    case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+    case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+    case ISD::SRA:
+    case ISD::SHL:
+    case ISD::SRL:
+      return LowerVECTOR_SHIFT(Op, DAG);
+    case ISD::ConstantPool:
+      return LowerConstantPool(Op, DAG);
    case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
      // Frame & Return address.  Currently unimplemented.
    case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
@@ -1566,9 +2396,14 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
    case ISD::VASTART:            return LowerVASTART(Op, DAG);
    case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
+    // Custom lower some vector loads.
+    case ISD::LOAD:               return LowerLOAD(Op, DAG);

    case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
    case ISD::SELECT:             return Op;
+    case ISD::SETCC:              return LowerSETCC(Op, DAG);
+    case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
+    case ISD::CTPOP:              return LowerCTPOP(Op, DAG);
    case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    case ISD::INLINEASM:          return LowerINLINEASM(Op, DAG);

--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -58,13 +58,36 @@ bool isPositiveHalfWord(SDNode *N);
      CALLR,

      RET_FLAG,    // Return with a flag operand.
-      BR_JT,       // Jump table.
-      BARRIER,     // Memory barrier
+      BR_JT,       // Branch through jump table.
+      BARRIER,     // Memory barrier.
+      JT,          // Jump table.
+      CP,          // Constant pool.
      POPCOUNT,
      COMBINE,
      PACKHL,
-      JT,
-      CP,
+      VSPLATB,
+      VSPLATH,
+      SHUFFEB,
+      SHUFFEH,
+      SHUFFOB,
+      SHUFFOH,
+      VSXTBH,
+      VSXTBW,
+      VSRAW,
+      VSRAH,
+      VSRLW,
+      VSRLH,
+      VSHLW,
+      VSHLH,
+      VCMPBEQ,
+      VCMPBGT,
+      VCMPBGTU,
+      VCMPHEQ,
+      VCMPHGT,
+      VCMPHGTU,
+      VCMPWEQ,
+      VCMPWGT,
+      VCMPWGTU,
      INSERT_ri,
      INSERT_rd,
      INSERT_riv,
@@ -73,17 +96,6 @@ bool isPositiveHalfWord(SDNode *N);
      EXTRACTU_rd,
      EXTRACTU_riv,
      EXTRACTU_rdv,
-      WrapperCombineII,
-      WrapperCombineRR,
-      WrapperCombineRI_V4,
-      WrapperCombineIR_V4,
-      WrapperPackhl,
-      WrapperSplatB,
-      WrapperSplatH,
-      WrapperShuffEB,
-      WrapperShuffEH,
-      WrapperShuffOB,
-      WrapperShuffOH,
      TC_RETURN,
      EH_RETURN,
      DCFETCH
@@ -98,6 +110,8 @@ bool isPositiveHalfWord(SDNode *N);
    bool CanReturnSmallStruct(const Function* CalleeFn,
                              unsigned& RetSize) const;

+    void promoteLdStType(EVT VT, EVT PromotedLdStVT);
+
  public:
    const HexagonSubtarget *Subtarget;
    explicit HexagonTargetLowering(const TargetMachine &TM,
@@ -123,10 +137,17 @@ bool isPositiveHalfWord(SDNode *N);

    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;

-    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+    // Should we expand the build vector with shuffles?
+    bool shouldExpandBuildVectorWithShuffles(EVT VT,
+                                        unsigned DefinedValues) const override;

+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
    const char *getTargetNodeName(unsigned Opcode) const override;
-    SDValue  LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
@@ -150,9 +171,13 @@ bool isPositiveHalfWord(SDNode *N);
                            const SmallVectorImpl<SDValue> &OutVals,
                            SDValue Callee) const;

+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;

    SDValue LowerReturn(SDValue Chain,
                        CallingConv::ID CallConv, bool isVarArg,
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -566,6 +566,8 @@ void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
 }
 bool
 HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  const HexagonRegisterInfo &TRI = getRegisterInfo();
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned Opc = MI->getOpcode();
@@ -587,6 +589,55 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
      MBB.erase(MI);
      return true;
    }
+    case Hexagon::VMULW: {
+      // Expand a 64-bit vector multiply into 2 32-bit scalar multiplies.
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned Src1Reg = MI->getOperand(1).getReg();
+      unsigned Src2Reg = MI->getOperand(2).getReg();
+      unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
+      unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
+      unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
+      unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi),
+              TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
+          .addReg(Src2SubHi);
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi),
+              TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
+          .addReg(Src2SubLo);
+      MBB.erase(MI);
+      MRI.clearKillFlags(Src1SubHi);
+      MRI.clearKillFlags(Src1SubLo);
+      MRI.clearKillFlags(Src2SubHi);
+      MRI.clearKillFlags(Src2SubLo);
+      return true;
+    }
+    case Hexagon::VMULW_ACC: {
+      // Expand 64-bit vector multiply with addition into 2 scalar multiplies.
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned Src1Reg = MI->getOperand(1).getReg();
+      unsigned Src2Reg = MI->getOperand(2).getReg();
+      unsigned Src3Reg = MI->getOperand(3).getReg();
+      unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
+      unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
+      unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
+      unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
+      unsigned Src3SubHi = TRI.getSubReg(Src3Reg, Hexagon::subreg_hireg);
+      unsigned Src3SubLo = TRI.getSubReg(Src3Reg, Hexagon::subreg_loreg);
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci),
+              TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
+          .addReg(Src2SubHi).addReg(Src3SubHi);
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci),
+              TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
+          .addReg(Src2SubLo).addReg(Src3SubLo);
+      MBB.erase(MI);
+      MRI.clearKillFlags(Src1SubHi);
+      MRI.clearKillFlags(Src1SubLo);
+      MRI.clearKillFlags(Src2SubHi);
+      MRI.clearKillFlags(Src2SubLo);
+      MRI.clearKillFlags(Src3SubHi);
+      MRI.clearKillFlags(Src3SubLo);
+      return true;
+    }
    case Hexagon::TCRETURNi:
      MI->setDesc(get(Hexagon::J2_jump));
      return true;
--- a/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -20,6 +20,34 @@ def V8I8:  PatLeaf<(v8i8  DoubleRegs:$R)>;
 def V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>;
 def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>;

+// Same-size bitconverts are no-ops: the source register is reused, retyped.
+multiclass bitconvert_32<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a IntRegs:$src))),
+             (b IntRegs:$src)>;
+  def : Pat <(a (bitconvert (b IntRegs:$src))),
+             (a IntRegs:$src)>;
+}
+
+multiclass bitconvert_64<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a DoubleRegs:$src))),
+             (b DoubleRegs:$src)>;
+  def : Pat <(a (bitconvert (b DoubleRegs:$src))),
+             (a DoubleRegs:$src)>;
+}
+
+// Bit convert vector types.
+defm : bitconvert_32<v4i8, i32>;
+defm : bitconvert_32<v2i16, i32>;
+defm : bitconvert_32<v2i16, v4i8>;
+
+defm : bitconvert_64<v8i8, i64>;
+defm : bitconvert_64<v4i16, i64>;
+defm : bitconvert_64<v2i32, i64>;
+defm : bitconvert_64<v8i8, v4i16>;
+defm : bitconvert_64<v8i8, v2i32>;
+defm : bitconvert_64<v4i16, v2i32>;
+
+
 // Vector shift support. Vector shifting in Hexagon is rather different
 // from internal representation of LLVM.
 // LLVM assumes all shifts (in vector case) will have the form
@@ -44,6 +72,12 @@ class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
  let Inst{12-8} = src2;
 }

+def : Pat<(v2i16 (add (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
+          (A2_svaddh IntRegs:$src1, IntRegs:$src2)>;
+
+def : Pat<(v2i16 (sub (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
+          (A2_svsubh IntRegs:$src1, IntRegs:$src2)>;
+
 def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>;
 def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>;
 def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>;
@@ -52,6 +86,87 @@ def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>;
 def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>;
 def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>;

+
+def HexagonVSPLATB: SDNode<"HexagonISD::VSPLATB", SDTUnaryOp>;
+def HexagonVSPLATH: SDNode<"HexagonISD::VSPLATH", SDTUnaryOp>;
+
+// Replicate the low 8 bits of the 32-bit input register into each of the
+// four bytes of the 32-bit destination register.
+def: Pat<(v4i8  (HexagonVSPLATB I32:$Rs)), (S2_vsplatrb I32:$Rs)>;
+
+// Replicate the low 16 bits of the 32-bit input register into each of the
+// four halfwords of the 64-bit destination register.
+def: Pat<(v4i16 (HexagonVSPLATH I32:$Rs)), (S2_vsplatrh I32:$Rs)>;
+
+// Select a two-operand vector op as MI applied to the same two operands.
+class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type>
+  : Pat <(Op Type:$Rss, Type:$Rtt),
+         (MI Type:$Rss, Type:$Rtt)>;
+
+def: VArith_pat <A2_vaddub, add, V8I8>;
+def: VArith_pat <A2_vaddh,  add, V4I16>;
+def: VArith_pat <A2_vaddw,  add, V2I32>;
+def: VArith_pat <A2_vsubub, sub, V8I8>;
+def: VArith_pat <A2_vsubh,  sub, V4I16>;
+def: VArith_pat <A2_vsubw,  sub, V2I32>;
+
+def: VArith_pat <A2_and,    and, V2I16>;
+def: VArith_pat <A2_xor,    xor, V2I16>;
+def: VArith_pat <A2_or,     or,  V2I16>;
+
+def: VArith_pat <A2_andp,   and, V8I8>;
+def: VArith_pat <A2_andp,   and, V4I16>;
+def: VArith_pat <A2_andp,   and, V2I32>;
+def: VArith_pat <A2_orp,    or,  V8I8>;
+def: VArith_pat <A2_orp,    or,  V4I16>;
+def: VArith_pat <A2_orp,    or,  V2I32>;
+def: VArith_pat <A2_xorp,   xor, V8I8>;
+def: VArith_pat <A2_xorp,   xor, V4I16>;
+def: VArith_pat <A2_xorp,   xor, V2I32>;
+
+def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5ImmPred:$c),
+                                                    (i32 u5ImmPred:$c))))),
+         (S2_asr_i_vw V2I32:$b, imm:$c)>;
+def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5ImmPred:$c),
+                                                    (i32 u5ImmPred:$c))))),
+         (S2_lsr_i_vw V2I32:$b, imm:$c)>;
+def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5ImmPred:$c),
+                                                    (i32 u5ImmPred:$c))))),
+         (S2_asl_i_vw V2I32:$b, imm:$c)>;
+
+def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4ImmPred:$c)))))),
+         (S2_asr_i_vh V4I16:$b, imm:$c)>;
+def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4ImmPred:$c)))))),
+         (S2_lsr_i_vh V4I16:$b, imm:$c)>;
+def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4ImmPred:$c)))))),
+         (S2_asl_i_vh V4I16:$b, imm:$c)>;
+
+
+def SDTHexagon_v2i32_v2i32_i32 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisVT<0, v2i32>, SDTCisInt<2>]>;
+def SDTHexagon_v4i16_v4i16_i32 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisVT<0, v4i16>, SDTCisInt<2>]>;
+
+def HexagonVSRAW: SDNode<"HexagonISD::VSRAW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSRAH: SDNode<"HexagonISD::VSRAH", SDTHexagon_v4i16_v4i16_i32>;
+def HexagonVSRLW: SDNode<"HexagonISD::VSRLW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSRLH: SDNode<"HexagonISD::VSRLH", SDTHexagon_v4i16_v4i16_i32>;
+def HexagonVSHLW: SDNode<"HexagonISD::VSHLW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSHLH: SDNode<"HexagonISD::VSHLH", SDTHexagon_v4i16_v4i16_i32>;
+
+def: Pat<(v2i32 (HexagonVSRAW V2I32:$Rs, u5ImmPred:$u5)),
+         (S2_asr_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSRAH V4I16:$Rs, u4ImmPred:$u4)),
+         (S2_asr_i_vh V4I16:$Rs, imm:$u4)>;
+def: Pat<(v2i32 (HexagonVSRLW V2I32:$Rs, u5ImmPred:$u5)),
+         (S2_lsr_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSRLH V4I16:$Rs, u4ImmPred:$u4)),
+         (S2_lsr_i_vh V4I16:$Rs, imm:$u4)>;
+def: Pat<(v2i32 (HexagonVSHLW V2I32:$Rs, u5ImmPred:$u5)),
+         (S2_asl_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSHLH V4I16:$Rs, u4ImmPred:$u4)),
+         (S2_asl_i_vh V4I16:$Rs, imm:$u4)>;
+
 // Vector shift words by register
 def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>;
 def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>;
@@ -63,3 +178,306 @@ def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>;
 def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>;
 def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>;
 def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>;
+
+class vshift_rr_pat<InstHexagon MI, SDNode Op, PatFrag Value>
+  : Pat <(Op Value:$Rs, I32:$Rt),
+         (MI Value:$Rs, I32:$Rt)>;
+
+def: vshift_rr_pat <S2_asr_r_vw, HexagonVSRAW, V2I32>;
+def: vshift_rr_pat <S2_asr_r_vh, HexagonVSRAH, V4I16>;
+def: vshift_rr_pat <S2_lsr_r_vw, HexagonVSRLW, V2I32>;
+def: vshift_rr_pat <S2_lsr_r_vh, HexagonVSRLH, V4I16>;
+def: vshift_rr_pat <S2_asl_r_vw, HexagonVSHLW, V2I32>;
+def: vshift_rr_pat <S2_asl_r_vh, HexagonVSHLH, V4I16>;
+
+
+def SDTHexagonVecCompare_v8i8 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v8i8>]>;
+def SDTHexagonVecCompare_v4i16 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v4i16>]>;
+def SDTHexagonVecCompare_v2i32 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v2i32>]>;
+
+def HexagonVCMPBEQ:  SDNode<"HexagonISD::VCMPBEQ",  SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPBGT:  SDNode<"HexagonISD::VCMPBGT",  SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPBGTU: SDNode<"HexagonISD::VCMPBGTU", SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPHEQ:  SDNode<"HexagonISD::VCMPHEQ",  SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPHGT:  SDNode<"HexagonISD::VCMPHGT",  SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPHGTU: SDNode<"HexagonISD::VCMPHGTU", SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPWEQ:  SDNode<"HexagonISD::VCMPWEQ",  SDTHexagonVecCompare_v2i32>;
+def HexagonVCMPWGT:  SDNode<"HexagonISD::VCMPWGT",  SDTHexagonVecCompare_v2i32>;
+def HexagonVCMPWGTU: SDNode<"HexagonISD::VCMPWGTU", SDTHexagonVecCompare_v2i32>;
+
+// Select a two-operand compare node Op with an i1 result as instruction MI.
+class vcmp_i1_pat<InstHexagon MI, SDNode Op, PatFrag Value>
+  : Pat <(i1 (Op Value:$Rs, Value:$Rt)),
+         (MI Value:$Rs, Value:$Rt)>;
+
+def: vcmp_i1_pat<A2_vcmpbeq,  HexagonVCMPBEQ,  V8I8>;
+def: vcmp_i1_pat<A4_vcmpbgt,  HexagonVCMPBGT,  V8I8>;
+def: vcmp_i1_pat<A2_vcmpbgtu, HexagonVCMPBGTU, V8I8>;
+
+def: vcmp_i1_pat<A2_vcmpheq,  HexagonVCMPHEQ,  V4I16>;
+def: vcmp_i1_pat<A2_vcmphgt,  HexagonVCMPHGT,  V4I16>;
+def: vcmp_i1_pat<A2_vcmphgtu, HexagonVCMPHGTU, V4I16>;
+
+def: vcmp_i1_pat<A2_vcmpweq,  HexagonVCMPWEQ,  V2I32>;
+def: vcmp_i1_pat<A2_vcmpwgt,  HexagonVCMPWGT,  V2I32>;
+def: vcmp_i1_pat<A2_vcmpwgtu, HexagonVCMPWGTU, V2I32>;
+
+// Select a two-operand compare Op with result type OutTy as instruction MI.
+class vcmp_vi1_pat<InstHexagon MI, PatFrag Op, PatFrag InVal, ValueType OutTy>
+  : Pat <(OutTy (Op InVal:$Rs, InVal:$Rt)),
+         (MI InVal:$Rs, InVal:$Rt)>;
+
+def: vcmp_vi1_pat<A2_vcmpweq,  seteq,  V2I32, v2i1>;
+def: vcmp_vi1_pat<A2_vcmpwgt,  setgt,  V2I32, v2i1>;
+def: vcmp_vi1_pat<A2_vcmpwgtu, setugt, V2I32, v2i1>;
+
+def: vcmp_vi1_pat<A2_vcmpheq,  seteq,  V4I16, v4i1>;
+def: vcmp_vi1_pat<A2_vcmphgt,  setgt,  V4I16, v4i1>;
+def: vcmp_vi1_pat<A2_vcmphgtu, setugt, V4I16, v4i1>;
+
+
+// Hexagon doesn't have a vector multiply with C semantics.
+// Instead, generate a pseudo instruction that gets expanded into two
+// scalar MPYI instructions.
+// This is expanded by ExpandPostRAPseudos.
+let isPseudo = 1 in
+def VMULW : PseudoM<(outs DoubleRegs:$Rd),
+      (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+      ".error \"Should never try to emit VMULW\"",
+      [(set V2I32:$Rd, (mul V2I32:$Rs, V2I32:$Rt))]>;
+
+let isPseudo = 1 in
+def VMULW_ACC : PseudoM<(outs DoubleRegs:$Rd),
+      (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt),
+      ".error \"Should never try to emit VMULW_ACC\"",
+      [(set V2I32:$Rd, (add V2I32:$Rx, (mul V2I32:$Rs, V2I32:$Rt)))],
+      "$Rd = $Rx">;
+
+// Adds two v4i8: Hexagon does not have an insn for this one, so we
+// use the double add v8i8, and use only the low part of the result.
+def: Pat<(v4i8 (add (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))),
+         (LoReg (A2_vaddub (Zext64 $Rs), (Zext64 $Rt)))>;
+
+// Subtract two v4i8: Hexagon does not have an insn for this one, so we
+// use the double sub v8i8, and use only the low part of the result.
+def: Pat<(v4i8 (sub (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))),
+         (LoReg (A2_vsubub (Zext64 $Rs), (Zext64 $Rt)))>;
+
+//
+// No 32 bit vector mux.
+//
+def: Pat<(v4i8 (select I1:$Pu, V4I8:$Rs, V4I8:$Rt)),
+         (LoReg (C2_vmux I1:$Pu, (Zext64 $Rs), (Zext64 $Rt)))>;
+def: Pat<(v2i16 (select I1:$Pu, V2I16:$Rs, V2I16:$Rt)),
+         (LoReg (C2_vmux I1:$Pu, (Zext64 $Rs), (Zext64 $Rt)))>;
+
+//
+// 64-bit vector mux.
+//
+def: Pat<(v8i8 (vselect V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)),
+         (C2_vmux V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)>;
+def: Pat<(v4i16 (vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)),
+         (C2_vmux V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)>;
+def: Pat<(v2i32 (vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)),
+         (C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>;
+
+//
+// No 32 bit vector compare.
+//
+def: Pat<(i1 (seteq V4I8:$Rs, V4I8:$Rt)),
+         (A2_vcmpbeq (Zext64 $Rs), (Zext64 $Rt))>;
+def: Pat<(i1 (setgt V4I8:$Rs, V4I8:$Rt)),
+         (A4_vcmpbgt (Zext64 $Rs), (Zext64 $Rt))>;
+def: Pat<(i1 (setugt V4I8:$Rs, V4I8:$Rt)),
+         (A2_vcmpbgtu (Zext64 $Rs), (Zext64 $Rt))>;
+
+def: Pat<(i1 (seteq V2I16:$Rs, V2I16:$Rt)),
+         (A2_vcmpheq (Zext64 $Rs), (Zext64 $Rt))>;
+def: Pat<(i1 (setgt V2I16:$Rs, V2I16:$Rt)),
+         (A2_vcmphgt (Zext64 $Rs), (Zext64 $Rt))>;
+def: Pat<(i1 (setugt V2I16:$Rs, V2I16:$Rt)),
+         (A2_vcmphgtu (Zext64 $Rs), (Zext64 $Rt))>;
+
+// Select CmpOp(Rs, Rt) as InvMI with the two operands swapped.
+class InvertCmp_pat<InstHexagon InvMI, PatFrag CmpOp, PatFrag Value,
+                    ValueType CmpTy>
+  : Pat<(CmpTy (CmpOp Value:$Rs, Value:$Rt)),
+        (InvMI Value:$Rt, Value:$Rs)>;
+
+// Map from a compare operation to the corresponding instruction with the
+// order of operands reversed, e.g.  x < y --> cmp.gt(y,x).
+def: InvertCmp_pat<A4_vcmpbgt,  setlt,  V8I8,  i1>;
+def: InvertCmp_pat<A4_vcmpbgt,  setlt,  V8I8,  v8i1>;
+def: InvertCmp_pat<A2_vcmphgt,  setlt,  V4I16, i1>;
+def: InvertCmp_pat<A2_vcmphgt,  setlt,  V4I16, v4i1>;
+def: InvertCmp_pat<A2_vcmpwgt,  setlt,  V2I32, i1>;
+def: InvertCmp_pat<A2_vcmpwgt,  setlt,  V2I32, v2i1>;
+
+def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8,  i1>;
+def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8,  v8i1>;
+def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, i1>;
+def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, v4i1>;
+def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, i1>;
+def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, v2i1>;
+
+// Map word-vector setne onto the negated word equality compare (A2_vcmpweq):
+// rs != rt -> !(rs == rt).
+def: Pat<(v2i1 (setne V2I32:$Rs, V2I32:$Rt)),
+         (C2_not (v2i1 (A2_vcmpweq V2I32:$Rs, V2I32:$Rt)))>;
+
+
+// Truncate: from vector B copy all 'E'ven 'B'yte elements:
+// A[0] = B[0];  A[1] = B[2];  A[2] = B[4];  A[3] = B[6];
+def: Pat<(v4i8 (trunc V4I16:$Rs)),
+         (S2_vtrunehb V4I16:$Rs)>;
+
+// Truncate: from vector B copy all 'O'dd 'B'yte elements:
+// A[0] = B[1];  A[1] = B[3];  A[2] = B[5];  A[3] = B[7];
+// S2_vtrunohb
+
+// Truncate: from vectors B and C copy all 'E'ven 'H'alf-word elements:
+// A[0] = B[0];  A[1] = B[2];  A[2] = C[0];  A[3] = C[2];
+// S2_vtruneh
+
+def: Pat<(v2i16 (trunc V2I32:$Rs)),
+         (LoReg (S2_packhl (HiReg $Rs), (LoReg $Rs)))>;
+
+
+def HexagonVSXTBH : SDNode<"HexagonISD::VSXTBH", SDTUnaryOp>;
+def HexagonVSXTBW : SDNode<"HexagonISD::VSXTBW", SDTUnaryOp>;
+
+def: Pat<(i64 (HexagonVSXTBH I32:$Rs)), (S2_vsxtbh I32:$Rs)>;
+def: Pat<(i64 (HexagonVSXTBW I32:$Rs)), (S2_vsxthw I32:$Rs)>;
+
+def: Pat<(v4i16 (zext   V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (zext   V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+def: Pat<(v4i16 (anyext V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (anyext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+def: Pat<(v4i16 (sext   V4I8:$Rs)),  (S2_vsxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (sext   V2I16:$Rs)), (S2_vsxthw V2I16:$Rs)>;
+
+// Sign extends a v2i8 into a v2i32.
+def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i8)),
+         (A2_combinew (A2_sxtb (HiReg $Rs)), (A2_sxtb (LoReg $Rs)))>;
+
+// Sign extends a v2i16 into a v2i32.
+def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i16)),
+         (A2_combinew (A2_sxth (HiReg $Rs)), (A2_sxth (LoReg $Rs)))>;
+
+
+// vmpyh: multiplies two v2i16 and returns a v2i32.  It uses the
+// saturating multiply (M2_vmpy2s_s0), as Hexagon does not provide a
+// non-saturating vector multiply; saturation does not affect the result,
+// which has double the precision of the operands.
+def vmpyh: OutPatFrag<(ops node:$Rs, node:$Rt),
+                      (M2_vmpy2s_s0 (i32 $Rs), (i32 $Rt))>;
+
+// Multiplies two v2i16 vectors: as Hexagon does not have a multiply
+// with the C semantics for this one, this pattern uses the half-word
+// multiply vmpyh that takes two v2i16 and returns a v2i32.  The product
+// is then passed to S2_vtrunewh (the other operand is A2_combineii 0, 0)
+// and the low subregister is kept, to fit the result back into a v2i16
+// and to simulate the wrap-around semantics for unsigned in C.
+def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)),
+         (LoReg (S2_vtrunewh (v2i32 (A2_combineii 0, 0)),
+                             (v2i32 (vmpyh V2I16:$Rs, V2I16:$Rt))))>;
+
+// Multiplies two v4i16 vectors: the high and the low words of the operands
+// are multiplied separately with vmpyh, and the two products are combined
+// back into a v4i16 with S2_vtrunewh.
+def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
+         (S2_vtrunewh (vmpyh (HiReg $Rs), (HiReg $Rt)),
+                      (vmpyh (LoReg $Rs), (LoReg $Rt)))>;
+
+// Byte multiply used by the patterns that are not predicated on HasV5T:
+// both operands are widened with S2_vsxtbh, the high and low words of the
+// widened values are multiplied with vmpyh, and the two products are
+// combined with S2_vtrunewh.
+def VMPYB_no_V5: OutPatFrag<(ops node:$Rs, node:$Rt),
+  (S2_vtrunewh (vmpyh (HiReg (S2_vsxtbh $Rs)), (HiReg (S2_vsxtbh $Rt))),
+               (vmpyh (LoReg (S2_vsxtbh $Rs)), (LoReg (S2_vsxtbh $Rt))))>;
+
+// Multiplies two v4i8 vectors.
+// With HasV5T: M5_vmpybsu followed by S2_vtrunehb.
+def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
+         (S2_vtrunehb (M5_vmpybsu V4I8:$Rs, V4I8:$Rt))>,
+     Requires<[HasV5T]>;
+
+// Without a predicate: VMPYB_no_V5 followed by S2_vtrunehb.
+def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
+         (S2_vtrunehb (VMPYB_no_V5 V4I8:$Rs, V4I8:$Rt))>;
+
+// Multiplies two v8i8 vectors: the high and the low words of the operands
+// are multiplied separately as v4i8 values, and the two results are
+// recombined with A2_combinew (high word first).
+// With HasV5T each word product uses M5_vmpybsu.
+def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
+         (A2_combinew (S2_vtrunehb (M5_vmpybsu (HiReg $Rs), (HiReg $Rt))),
+                      (S2_vtrunehb (M5_vmpybsu (LoReg $Rs), (LoReg $Rt))))>,
+     Requires<[HasV5T]>;
+
+// Without a predicate each word product uses VMPYB_no_V5.
+def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
+         (A2_combinew (S2_vtrunehb (VMPYB_no_V5 (HiReg $Rs), (HiReg $Rt))),
+                      (S2_vtrunehb (VMPYB_no_V5 (LoReg $Rs), (LoReg $Rt))))>;
+
+
+// Instruction class for a shuffle taking two DoubleRegs operands ($b, $c)
+// and producing a DoubleRegs result ($a).
+//   Op  - the SDNode matched by the built-in selection pattern (on i64).
+//   Str - the mnemonic; the assembly string is "$a = Str($b, $c)".
+class shuffler<SDNode Op, string Str>
+  : SInst<(outs DoubleRegs:$a), (ins DoubleRegs:$b, DoubleRegs:$c),
+      "$a = " # Str # "($b, $c)",
+      [(set (i64 DoubleRegs:$a),
+            (i64 (Op (i64 DoubleRegs:$b), (i64 DoubleRegs:$c))))],
+      "", S_3op_tc_1_SLOT23>;
+
+// Type profile for a binary node: one result and two operands, all of the
+// same type, which is constrained to i64.
+def SDTHexagonBinOp64 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>]>;
+
+// Hexagon-specific shuffle nodes (HexagonISD::SHUFFEB/EH/OB/OH), all using
+// the i64 binary profile above.
+def HexagonSHUFFEB: SDNode<"HexagonISD::SHUFFEB", SDTHexagonBinOp64>;
+def HexagonSHUFFEH: SDNode<"HexagonISD::SHUFFEH", SDTHexagonBinOp64>;
+def HexagonSHUFFOB: SDNode<"HexagonISD::SHUFFOB", SDTHexagonBinOp64>;
+def HexagonSHUFFOH: SDNode<"HexagonISD::SHUFFOH", SDTHexagonBinOp64>;
+
+// Selects the i64 shuffle node Op, applied to two DoubleRegs operands, to
+// the instruction MI with the operands in the same order.
+class ShufflePat<InstHexagon MI, SDNode Op>
+  : Pat<(i64 (Op DoubleRegs:$src1, DoubleRegs:$src2)),
+        (i64 (MI DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Shuffles even bytes for i=0..3: A[2*i].b = C[2*i].b; A[2*i+1].b = B[2*i].b
+def: ShufflePat<S2_shuffeb, HexagonSHUFFEB>;
+
+// Shuffles odd bytes for i=0..3: A[2*i].b = C[2*i+1].b; A[2*i+1].b = B[2*i+1].b
+def: ShufflePat<S2_shuffob, HexagonSHUFFOB>;
+
+// Shuffles even half for i=0,1: A[2*i].h = C[2*i].h; A[2*i+1].h = B[2*i].h
+def: ShufflePat<S2_shuffeh, HexagonSHUFFEH>;
+
+// Shuffles odd half for i=0,1: A[2*i].h = C[2*i+1].h; A[2*i+1].h = B[2*i+1].h
+def: ShufflePat<S2_shuffoh, HexagonSHUFFOH>;
+
+
+// Truncated store from v4i16 to v4i8.
+// Matches a truncstore whose memory type is v4i8.
+def truncstorev4i8: PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr),
+    [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i8; }]>;
+
+// Truncated store from v2i32 to v2i16.
+// Matches a truncstore whose memory type is v2i16.
+def truncstorev2i16: PatFrag<(ops node:$val, node:$ptr),
+                             (truncstore node:$val, node:$ptr),
+    [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i16; }]>;
+
+// $Rs is the value and $Rt the address.  The value is narrowed with the
+// same S2_packhl/LoReg sequence as (trunc v2i32 -> v2i16) and stored with
+// S2_storeri_io at offset 0.
+def: Pat<(truncstorev2i16 V2I32:$Rs, I32:$Rt),
+         (S2_storeri_io I32:$Rt, 0, (LoReg (S2_packhl (HiReg $Rs),
+                                                      (LoReg $Rs))))>;
+
+// $Rs is the value and $Rt the address.  The value is narrowed with
+// S2_vtrunehb and stored with S2_storeri_io at offset 0.
+def: Pat<(truncstorev4i8 V4I16:$Rs, I32:$Rt),
+         (S2_storeri_io I32:$Rt, 0, (S2_vtrunehb V4I16:$Rs))>;
+
+
+// Zero and sign extended loads from v2i8 into v2i16 and v2i32.
+// The fragments match a zextload/sextload whose memory type is v2i8.
+def zextloadv2i8: PatFrag<(ops node:$ptr), (zextload node:$ptr),
+    [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>;
+
+def sextloadv2i8: PatFrag<(ops node:$ptr), (sextload node:$ptr),
+    [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>;
+
+// v2i8 -> v2i16: load with L2_loadruh_io at offset 0, widen with S2_vzxtbh
+// and keep the low subregister of the v4i16 result.
+def: Pat<(v2i16 (zextloadv2i8 I32:$Rs)),
+         (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0))))>;
+
+// Signed variant of the above: L2_loadrh_io and S2_vsxtbh.
+def: Pat<(v2i16 (sextloadv2i8 I32:$Rs)),
+         (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0))))>;
+
+// v2i8 -> v2i32: the v2i16 sequence above followed by S2_vzxthw.
+def: Pat<(v2i32 (zextloadv2i8 I32:$Rs)),
+         (S2_vzxthw (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0)))))>;
+
+// Signed variant of the above: the signed v2i16 sequence followed by
+// S2_vsxthw.
+def: Pat<(v2i32 (sextloadv2i8 I32:$Rs)),
+         (S2_vsxthw (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0)))))>;
--- a/test/CodeGen/Hexagon/vect/vect-anyextend.ll
+++ b/test/CodeGen/Hexagon/vect/vect-anyextend.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with "Cannot select: 0x17300f0: v2i32 = any_extend"
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout =
+"e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+; Loads a <4 x i8>, zero-extends it to <4 x i32> and stores the result.
+; All pointers are undef; the function only has to compile.
+define void @foo() nounwind {
+entry:
+  %_p_vec_full48 = load <4 x i8>, <4 x i8>* undef, align 8
+  %0 = zext <4 x i8> %_p_vec_full48 to <4 x i32>
+  store <4 x i32> %0, <4 x i32>* undef, align 8
+  unreachable
+}
--- a/test/CodeGen/Hexagon/vect/vect-apint-truncate.ll
+++ b/test/CodeGen/Hexagon/vect/vect-apint-truncate.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with "Invalid APInt Truncate request".
+; Used to fail with "Cannot select: 0x596010: v2i32 = sign_extend_inreg".
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+; Loop whose body loads a <4 x i8>, sign-extends it to <4 x i32>, multiplies
+; by a splat of 3, adds a splat of 21 and stores the result.  All pointers
+; are undef; the function only has to compile.
+define void @foo() nounwind {
+entry:
+  br label %polly.loop_header
+
+polly.loop_after:                                 ; preds = %polly.loop_header
+  unreachable
+
+polly.loop_header:                                ; preds = %polly.loop_body, %entry
+  %0 = icmp sle i32 undef, 63
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+
+polly.loop_body:                                  ; preds = %polly.loop_header
+  %_p_vec_full = load <4 x i8>, <4 x i8>* undef, align 8
+  %1 = sext <4 x i8> %_p_vec_full to <4 x i32>
+  %p_vec = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+  %mulp_vec = add <4 x i32> %p_vec, <i32 21, i32 21, i32 21, i32 21>
+  store <4 x i32> %mulp_vec, <4 x i32>* undef, align 8
+  br label %polly.loop_header
+}
--- a/test/CodeGen/Hexagon/vect/vect-bad-bitcast.ll
+++ b/test/CodeGen/Hexagon/vect/vect-bad-bitcast.ll
@@ -0,0 +1,61 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s
+; REQUIRES: asserts
+; Check for successful compilation.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
+target triple = "hexagon"
+
+@input_buf = internal unnamed_addr constant [256 x i16] [i16 0, i16 0, i16 0, i16 1280, i16 2560, i16 4864, i16 7168, i16 9472, i16 11776, i16 12672, i16 13568, i16 14080, i16 15360, i16 15360, i16 15360, i16 15360, i16 15360, i16 15104, i16 14848, i16 14592, i16 14336, i16 14080, i16 14080, i16 13952, i16 13824, i16 13696, i16 13568, i16 13440, i16 13312, i16 13184, i16 13056, i16 12928, i16 12800, i16 12800, i16 12800, i16 12800, i16 12800, i16 12672, i16 12544, i16 12544, i16 12544, i16 12544, i16 12672, i16 12800, i16 12800, i16 12928, i16 13056, i16 13184, i16 13312, i16 13440, i16 13568, i16 13696, i16 13824, i16 14208, i16 14592, i16 14976, i16 15104, i16 15360, i16 15616, i16 15872, i16 16128, i16 16512, i16 16896, i16 17152, i16 17408, i16 17536, i16 17664, i16 17792, i16 17920, i16 18304, i16 18688, i16 19072, i16 19456, i16 19712, i16 19968, i16 20224, i16 20480, i16 20608, i16 20864, i16 20992, i16 21248, i16 21248, i16 21248, i16 21248, i16 21248, i16 21248, i16 21376, i16 21504, i16 21760, i16 21760, i16 21632, i16 21504, i16 21504, i16 21632, i16 21632, i16 21504, i16 21504, i16 21376, i16 21248, i16 21120, i16 20992, i16 20992, i16 20864, i16 20736, i16 20736, i16 20736, i16 20480, i16 20352, i16 20224, i16 20224, i16 20224, i16 20224, i16 20352, i16 20352, i16 20480, i16 20352, i16 20352, i16 20352, i16 20352, i16 20224, i16 20224, i16 20224, i16 20096, i16 20096, i16 19968, i16 19840, i16 19712, i16 19584, i16 19456, i16 19584, i16 19584, i16 19456, i16 19456, i16 19328, i16 19328, i16 19456, i16 19456, i16 19328, i16 19328, i16 19200, i16 19200, i16 19200, i16 19072, i16 19072, i16 18944, i16 18816, i16 18688, i16 18560, i16 18432, i16 18304, i16 18304, i16 18176, i16 18176, i16 18176, i16 18304, i16 18304, i16 18432, i16 18560, i16 18432, i16 18176, i16 17920, i16 17920, i16 17792, i16 17792, i16 17664, i16 17664, i16 17536, i16 17536, i16 17408, i16 17408, i16 17280, i16 17280, i16 17280, i16 17152, i16 17152, i16 17152, i16 17152, i16 17024, 
i16 17024, i16 16896, i16 16896, i16 16896, i16 16768, i16 16768, i16 16640, i16 16640, i16 16512, i16 16512, i16 16384, i16 16256, i16 16128, i16 16000, i16 15872, i16 15744, i16 15616, i16 15488, i16 15360, i16 15488, i16 15360, i16 15232, i16 15360, i16 15232, i16 15104, i16 14976, i16 14336, i16 14336, i16 14592, i16 14464, i16 13824, i16 13824, i16 13568, i16 13568, i16 13440, i16 13312, i16 13184, i16 13056, i16 13056, i16 13056, i16 12928, i16 12800, i16 12672, i16 12672, i16 12544, i16 12416, i16 12288, i16 12160, i16 11904, i16 11776, i16 11571, i16 11520, i16 11392, i16 11136, i16 10905, i16 10752, i16 10624, i16 10444, i16 10240, i16 9984, i16 9728, i16 9472, i16 9216, i16 8960, i16 8704, i16 8448, i16 8192, i16 7936, i16 7680, i16 7424, i16 7168, i16 6400, i16 5632, i16 4864, i16 3584, i16 1536, i16 0, i16 0], align 8
+
+; polly.stmt.for.body walks @input_buf four i16 at a time and keeps a
+; lane-wise signed maximum (icmp sgt + select) in a <4 x i16>.
+; polly.loop_exit.loopexit reduces that vector to a single i16 maximum by
+; splitting it into <2 x i16> halves and then into scalars, and sign-extends
+; the result to i32.  for.body8 passes that value to @fxpBitAllocation in a
+; loop whose counter goes from 0 to 500 in steps of 25.
+; Function Attrs: nounwind
+define i32 @t_run_test() #0 {
+entry:
+  %WaterLeveldB_out = alloca i16, align 2
+  br label %polly.stmt.for.body
+
+for.body8:                                        ; preds = %for.body8, %polly.loop_exit.loopexit
+  %i.120 = phi i32 [ 0, %polly.loop_exit.loopexit ], [ %inc11.24, %for.body8 ]
+  %call = call i32 bitcast (i32 (...)* @fxpBitAllocation to i32 (i32, i32, i32, i32, i16*, i32, i32, i32)*)(i32 0, i32 0, i32 256, i32 %conv9, i16* %WaterLeveldB_out, i32 0, i32 1920, i32 %i.120) #2
+  %inc11.24 = add i32 %i.120, 25
+  %exitcond.24 = icmp eq i32 %inc11.24, 500
+  br i1 %exitcond.24, label %for.end12, label %for.body8
+
+for.end12:                                        ; preds = %for.body8
+  ret i32 0
+
+polly.loop_exit.loopexit:                         ; preds = %polly.stmt.for.body
+  %WaterLeveldB.1p_vsel.lcssa = phi <4 x i16> [ %WaterLeveldB.1p_vsel, %polly.stmt.for.body ]
+  %_low_half = shufflevector <4 x i16> %WaterLeveldB.1p_vsel.lcssa, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %_high_half = shufflevector <4 x i16> %WaterLeveldB.1p_vsel.lcssa, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+  %0 = icmp sgt <2 x i16> %_low_half, %_high_half
+  %1 = select <2 x i1> %0, <2 x i16> %_low_half, <2 x i16> %_high_half
+  %2 = extractelement <2 x i16> %1, i32 0
+  %3 = extractelement <2 x i16> %1, i32 1
+  %4 = icmp sgt i16 %2, %3
+  %5 = select i1 %4, i16 %2, i16 %3
+  %conv9 = sext i16 %5 to i32
+  br label %for.body8
+
+polly.stmt.for.body:                              ; preds = %entry, %polly.stmt.for.body
+  %WaterLeveldB.1p_vsel35 = phi <4 x i16> [ <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, %entry ], [ %WaterLeveldB.1p_vsel, %polly.stmt.for.body ]
+  %scevgep.phi = phi i16* [ getelementptr inbounds ([256 x i16], [256 x i16]* @input_buf, i32 0, i32 0), %entry ], [ %scevgep.inc, %polly.stmt.for.body ]
+  %polly.indvar = phi i32 [ 0, %entry ], [ %polly.indvar_next, %polly.stmt.for.body ]
+  %vector_ptr = bitcast i16* %scevgep.phi to <4 x i16>*
+  %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 8
+  %cmp2p_vicmp = icmp sgt <4 x i16> %_p_vec_full, %WaterLeveldB.1p_vsel35
+  %WaterLeveldB.1p_vsel = select <4 x i1> %cmp2p_vicmp, <4 x i16> %_p_vec_full, <4 x i16> %WaterLeveldB.1p_vsel35
+  %polly.indvar_next = add nsw i32 %polly.indvar, 4
+  %polly.loop_cond = icmp slt i32 %polly.indvar, 252
+  %scevgep.inc = getelementptr i16, i16* %scevgep.phi, i32 4
+  br i1 %polly.loop_cond, label %polly.stmt.for.body, label %polly.loop_exit.loopexit
+}
+
+declare i32 @fxpBitAllocation(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"QuIC LLVM Hexagon Clang version 3.1"}
--- a/test/CodeGen/Hexagon/vect/vect-bitcast-1.ll
+++ b/test/CodeGen/Hexagon/vect/vect-bitcast-1.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+; Used to fail with: Assertion `VT.getSizeInBits() == Operand.getValueType().getSizeInBits() && "Cannot BITCAST between types of different sizes!"' failed.
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+; Reduced control-flow skeleton; the only block doing real work is
+; if.then155.  It truncates the result of @strtol to i16, splats it from a
+; <1 x i16> into a <2 x i16>, sign-extends that to <2 x i32>, shifts left
+; by 2, and stores lane 0.
+define void @foo() nounwind {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %if.then155, %if.then12, %entry
+  %cmp.i = icmp eq i8* undef, null
+  br i1 %cmp.i, label %lab_ci.exit, label %if.end.i
+
+if.end.i:                                         ; preds = %while.body
+  unreachable
+
+lab_ci.exit:      ; preds = %while.body
+  br i1 false, label %if.then, label %if.else
+
+if.then:                                          ; preds = %lab_ci.exit
+  unreachable
+
+if.else:                                          ; preds = %lab_ci.exit
+  br i1 undef, label %if.then12, label %if.else17
+
+if.then12:                                        ; preds = %if.else
+  br label %while.body
+
+if.else17:                                        ; preds = %if.else
+  br i1 false, label %if.then22, label %if.else35
+
+if.then22:                                        ; preds = %if.else17
+  unreachable
+
+if.else35:                                        ; preds = %if.else17
+  br i1 false, label %if.then40, label %if.else83
+
+if.then40:                                        ; preds = %if.else35
+  unreachable
+
+if.else83:                                        ; preds = %if.else35
+  br i1 false, label %if.then88, label %if.else150
+
+if.then88:                                        ; preds = %if.else83
+  unreachable
+
+if.else150:                                       ; preds = %if.else83
+  %cmp154 = icmp eq i32 undef, 0
+  br i1 %cmp154, label %if.then155, label %if.else208
+
+if.then155:                                       ; preds = %if.else150
+  %call191 = call i32 @strtol() nounwind
+  %conv192 = trunc i32 %call191 to i16
+  %_p_splat_one = insertelement <1 x i16> undef, i16 %conv192, i32 0
+  %_p_splat = shufflevector <1 x i16> %_p_splat_one, <1 x i16> undef, <2 x i32> zeroinitializer
+  %0 = sext <2 x i16> %_p_splat to <2 x i32>
+  %mul198p_vec = shl <2 x i32> %0, <i32 2, i32 2>
+  %1 = extractelement <2 x i32> %mul198p_vec, i32 0
+  store i32 %1, i32* null, align 4
+  br label %while.body
+
+if.else208:                                       ; preds = %if.else150
+  unreachable
+}
+
+declare i32 @strtol() nounwind
--- a/test/CodeGen/Hexagon/vect/vect-bitcast.ll
+++ b/test/CodeGen/Hexagon/vect/vect-bitcast.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+; Used to fail with "Cannot BITCAST between types of different sizes!"
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+; Reduced control-flow skeleton; the only block doing real work is
+; if.then155.  It splats a zero <1 x i16> into a <2 x i16>, sign-extends
+; that to <2 x i32>, multiplies by a splat of 4, and stores lane 0.
+define void @foo() nounwind {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %if.then155, %if.then12, %if.then, %entry
+  br i1 undef, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body
+  br label %while.body
+
+if.else:                                          ; preds = %while.body
+  br i1 undef, label %if.then12, label %if.else17
+
+if.then12:                                        ; preds = %if.else
+  br label %while.body
+
+if.else17:                                        ; preds = %if.else
+  br i1 false, label %if.then22, label %if.else35
+
+if.then22:                                        ; preds = %if.else17
+  unreachable
+
+if.else35:                                        ; preds = %if.else17
+  br i1 false, label %if.then40, label %if.else83
+
+if.then40:                                        ; preds = %if.else35
+  unreachable
+
+if.else83:                                        ; preds = %if.else35
+  br i1 false, label %if.then88, label %if.else150
+
+if.then88:                                        ; preds = %if.else83
+  unreachable
+
+if.else150:                                       ; preds = %if.else83
+  %cmp154 = icmp eq i32 undef, 0
+  br i1 %cmp154, label %if.then155, label %if.else208
+
+if.then155:                                       ; preds = %if.else150
+  %_p_splat.1 = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <2 x i32> zeroinitializer
+  %0 = sext <2 x i16> %_p_splat.1 to <2 x i32>
+  %mul198p_vec.1 = mul <2 x i32> %0, <i32 4, i32 4>
+  %1 = extractelement <2 x i32> %mul198p_vec.1, i32 0
+  store i32 %1, i32* undef, align 4
+  br label %while.body
+
+if.else208:                                       ; preds = %if.else150
+  unreachable
+}
--- a/test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll
+++ b/test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; This one should generate a combine with two immediates.
+; CHECK: combine(#7, #7)
+@B = common global [400 x i32] zeroinitializer, align 8
+@A = common global [400 x i32] zeroinitializer, align 8
+@C = common global [400 x i32] zeroinitializer, align 8
+
+; Loop over the 400 i32 elements of @A and @B, four at a time:
+; A[i..i+3] += B[i..i+3] * <7, 7, 7, 7>.
+define void @run() nounwind {
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:                                 ; preds = %polly.loop_body
+  ret void
+
+polly.loop_body:                                  ; preds = %entry, %polly.loop_body
+  %polly.loopiv23 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv23, 4
+  %p_arrayidx1 = getelementptr [400 x i32], [400 x i32]* @A, i32 0, i32 %polly.loopiv23
+  %p_arrayidx = getelementptr [400 x i32], [400 x i32]* @B, i32 0, i32 %polly.loopiv23
+  %vector_ptr = bitcast i32* %p_arrayidx to <4 x i32>*
+  %_p_vec_full = load <4 x i32>, <4 x i32>* %vector_ptr, align 8
+  %mulp_vec = mul <4 x i32> %_p_vec_full, <i32 7, i32 7, i32 7, i32 7>
+  %vector_ptr12 = bitcast i32* %p_arrayidx1 to <4 x i32>*
+  %_p_vec_full13 = load <4 x i32>, <4 x i32>* %vector_ptr12, align 8
+  %addp_vec = add <4 x i32> %_p_vec_full13, %mulp_vec
+  store <4 x i32> %addp_vec, <4 x i32>* %vector_ptr12, align 8
+  %0 = icmp slt i32 %polly.next_loopiv, 400
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+}
--- a/test/CodeGen/Hexagon/vect/vect-cst-v4i8.ll
+++ b/test/CodeGen/Hexagon/vect/vect-cst-v4i8.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; Make sure we can build the constant vector <1, 2, 3, 4>
+; CHECK-DAG: ##B
+; CHECK-DAG: ##A
+@B = common global [400 x i8] zeroinitializer, align 8
+@A = common global [400 x i8] zeroinitializer, align 8
+@C = common global [400 x i8] zeroinitializer, align 8
+
+; Loop over the 400 i8 elements of @A and @B, four at a time:
+; A[i..i+3] += B[i..i+3] * <1, 2, 3, 4>.
+define void @run() nounwind {
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:                                 ; preds = %polly.loop_body
+  ret void
+
+polly.loop_body:                                  ; preds = %entry, %polly.loop_body
+  %polly.loopiv25 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add i32 %polly.loopiv25, 4
+  %p_arrayidx1 = getelementptr [400 x i8], [400 x i8]* @A, i32 0, i32 %polly.loopiv25
+  %p_arrayidx = getelementptr [400 x i8], [400 x i8]* @B, i32 0, i32 %polly.loopiv25
+  %vector_ptr = bitcast i8* %p_arrayidx to <4 x i8>*
+  %_p_vec_full = load <4 x i8>, <4 x i8>* %vector_ptr, align 8
+  %mulp_vec = mul <4 x i8> %_p_vec_full, <i8 1, i8 2, i8 3, i8 4>
+  %vector_ptr14 = bitcast i8* %p_arrayidx1 to <4 x i8>*
+  %_p_vec_full15 = load <4 x i8>, <4 x i8>* %vector_ptr14, align 8
+  %addp_vec = add <4 x i8> %_p_vec_full15, %mulp_vec
+  store <4 x i8> %addp_vec, <4 x i8>* %vector_ptr14, align 8
+  %0 = icmp slt i32 %polly.next_loopiv, 400
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+}
--- a/test/CodeGen/Hexagon/vect/vect-cst.ll
+++ b/test/CodeGen/Hexagon/vect/vect-cst.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Make sure we can build the constant vector <7, 7, 7, 7>
+; CHECK: vaddub
+@B = common global [400 x i8] zeroinitializer, align 8
+@A = common global [400 x i8] zeroinitializer, align 8
+@C = common global [400 x i8] zeroinitializer, align 8
+
+; Loop over the 400 i8 elements of @A and @B, four at a time:
+; A[i..i+3] += B[i..i+3] * <7, 7, 7, 7>.
+define void @run() nounwind {
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:                                 ; preds = %polly.loop_body
+  ret void
+
+polly.loop_body:                                  ; preds = %entry, %polly.loop_body
+  %polly.loopiv25 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add i32 %polly.loopiv25, 4
+  %p_arrayidx1 = getelementptr [400 x i8], [400 x i8]* @A, i32 0, i32 %polly.loopiv25
+  %p_arrayidx = getelementptr [400 x i8], [400 x i8]* @B, i32 0, i32 %polly.loopiv25
+  %vector_ptr = bitcast i8* %p_arrayidx to <4 x i8>*
+  %_p_vec_full = load <4 x i8>, <4 x i8>* %vector_ptr, align 8
+  %mulp_vec = mul <4 x i8> %_p_vec_full, <i8 7, i8 7, i8 7, i8 7>
+  %vector_ptr14 = bitcast i8* %p_arrayidx1 to <4 x i8>*
+  %_p_vec_full15 = load <4 x i8>, <4 x i8>* %vector_ptr14, align 8
+  %addp_vec = add <4 x i8> %_p_vec_full15, %mulp_vec
+  store <4 x i8> %addp_vec, <4 x i8>* %vector_ptr14, align 8
+  %0 = icmp slt i32 %polly.next_loopiv, 400
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+}
--- a/test/CodeGen/Hexagon/vect/vect-extract.ll
+++ b/test/CodeGen/Hexagon/vect/vect-extract.ll
@@ -0,0 +1,96 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+
+; Check that we do not generate extract.
+; CHECK-NOT: extractu
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+; Outer loop runs %N times (%indvar).  For each iteration:
+;  - polly.loop_body is the vector loop: it loads two i16 from %A,
+;    sign-extends them to <2 x i32>, multiplies by a <2 x i32> built from
+;    the sign-extended %val with llvm.hexagon.A2.combinew, and stores the
+;    product to %C, advancing by two elements up to %leftover_lb.
+;  - polly.stmt.for.body331 is the scalar loop for the remaining elements:
+;    C[j] = sext(A[j]) * sext(%val).
+define void @foo(i32 %N, i32* nocapture %C, i16* nocapture %A, i16 signext %val) #0 {
+entry:
+  %cmp14 = icmp eq i32 %N, 0
+  br i1 %cmp14, label %for.end11, label %for.cond1.preheader.single_entry.preheader
+
+for.cond1.preheader.single_entry.preheader:       ; preds = %entry
+  %0 = add i32 %N, -1
+  %leftover_lb = and i32 %0, -2
+  %p_conv4 = sext i16 %val to i32
+  br label %for.cond1.preheader.single_entry
+
+for.cond1.preheader.single_entry:                 ; preds = %for.inc9, %for.cond1.preheader.single_entry.preheader
+  %indvar = phi i32 [ %indvar.next, %for.inc9 ], [ 0, %for.cond1.preheader.single_entry.preheader ]
+  %1 = mul i32 %indvar, %N
+  %.not = icmp slt i32 %N, 2
+  %.not41 = icmp slt i32 %leftover_lb, 1
+  %brmerge = or i1 %.not, %.not41
+  %.mux = select i1 %.not, i32 0, i32 %leftover_lb
+  br i1 %brmerge, label %polly.loop_header26.preheader, label %polly.loop_body.lr.ph
+
+for.inc9.loopexit:                                ; preds = %polly.stmt.for.body331
+  br label %for.inc9
+
+for.inc9:                                         ; preds = %for.inc9.loopexit, %polly.loop_header26.preheader
+  %indvar.next = add i32 %indvar, 1
+  %exitcond40 = icmp eq i32 %indvar.next, %N
+  br i1 %exitcond40, label %for.end11.loopexit, label %for.cond1.preheader.single_entry
+
+for.end11.loopexit:                               ; preds = %for.inc9
+  br label %for.end11
+
+for.end11:                                        ; preds = %for.end11.loopexit, %entry
+  ret void
+
+polly.loop_body.lr.ph:                            ; preds = %for.cond1.preheader.single_entry
+  %2 = call i64 @llvm.hexagon.A2.combinew(i32 %1, i32 %1)
+  %3 = bitcast i64 %2 to <2 x i32>
+  %4 = extractelement <2 x i32> %3, i32 0
+  %5 = call i64 @llvm.hexagon.A2.combinew(i32 %p_conv4, i32 %p_conv4)
+  %6 = bitcast i64 %5 to <2 x i32>
+  %p_arrayidx8.gep = getelementptr i32, i32* %C, i32 %4
+  %p_arrayidx.gep = getelementptr i16, i16* %A, i32 %4
+  br label %polly.loop_body
+
+polly.loop_body:                                  ; preds = %polly.loop_body.lr.ph, %polly.loop_body
+  %p_arrayidx8.phi = phi i32* [ %p_arrayidx8.gep, %polly.loop_body.lr.ph ], [ %p_arrayidx8.inc, %polly.loop_body ]
+  %p_arrayidx.phi = phi i16* [ %p_arrayidx.gep, %polly.loop_body.lr.ph ], [ %p_arrayidx.inc, %polly.loop_body ]
+  %polly.loopiv38 = phi i32 [ 0, %polly.loop_body.lr.ph ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv38, 2
+  %vector_ptr = bitcast i16* %p_arrayidx.phi to <2 x i16>*
+  %_p_vec_full = load <2 x i16>, <2 x i16>* %vector_ptr, align 2
+  %7 = sext <2 x i16> %_p_vec_full to <2 x i32>
+  %mul5p_vec = mul <2 x i32> %7, %6
+  %vector_ptr21 = bitcast i32* %p_arrayidx8.phi to <2 x i32>*
+  store <2 x i32> %mul5p_vec, <2 x i32>* %vector_ptr21, align 4
+  %8 = icmp slt i32 %polly.next_loopiv, %leftover_lb
+  %p_arrayidx8.inc = getelementptr i32, i32* %p_arrayidx8.phi, i32 2
+  %p_arrayidx.inc = getelementptr i16, i16* %p_arrayidx.phi, i32 2
+  br i1 %8, label %polly.loop_body, label %polly.loop_header26.preheader.loopexit
+
+polly.loop_header26.preheader.loopexit:           ; preds = %polly.loop_body
+  br label %polly.loop_header26.preheader
+
+polly.loop_header26.preheader:                    ; preds = %polly.loop_header26.preheader.loopexit, %for.cond1.preheader.single_entry
+  %polly.loopiv29.ph = phi i32 [ %.mux, %for.cond1.preheader.single_entry ], [ %leftover_lb, %polly.loop_header26.preheader.loopexit ]
+  %9 = icmp slt i32 %polly.loopiv29.ph, %N
+  br i1 %9, label %polly.stmt.for.body331.preheader, label %for.inc9
+
+polly.stmt.for.body331.preheader:                 ; preds = %polly.loop_header26.preheader
+  br label %polly.stmt.for.body331
+
+polly.stmt.for.body331:                           ; preds = %polly.stmt.for.body331.preheader, %polly.stmt.for.body331
+  %polly.loopiv2939 = phi i32 [ %polly.next_loopiv30, %polly.stmt.for.body331 ], [ %polly.loopiv29.ph, %polly.stmt.for.body331.preheader ]
+  %polly.next_loopiv30 = add nsw i32 %polly.loopiv2939, 1
+  %p_32 = add i32 %polly.loopiv2939, %1
+  %p_arrayidx833 = getelementptr i32, i32* %C, i32 %p_32
+  %p_arrayidx34 = getelementptr i16, i16* %A, i32 %p_32
+  %_p_scalar_ = load i16, i16* %p_arrayidx34, align 2
+  %p_conv = sext i16 %_p_scalar_ to i32
+  %p_mul5 = mul nsw i32 %p_conv, %p_conv4
+  store i32 %p_mul5, i32* %p_arrayidx833, align 4
+  %exitcond = icmp eq i32 %polly.next_loopiv30, %N
+  br i1 %exitcond, label %for.inc9.loopexit, label %polly.stmt.for.body331
+}
+
+declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
--- a/test/CodeGen/Hexagon/vect/vect-fma.ll
+++ b/test/CodeGen/Hexagon/vect/vect-fma.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s
+; REQUIRES: asserts
+; Used to fail with "SplitVectorResult #0: 0x16cbe60: v4f64 = fma"
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+; Loop whose body loads a <4 x double>, multiplies it by a splat of 7.0,
+; adds the product to an undef vector and stores the result.  All pointers
+; are undef; the function only has to compile.
+define void @run() nounwind {
+entry:
+  br label %polly.loop_header
+
+polly.loop_after:                                 ; preds = %polly.loop_header
+  ret void
+
+polly.loop_header:                                ; preds = %polly.loop_body, %entry
+  %0 = icmp sle i32 undef, 399
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+
+polly.loop_body:                                  ; preds = %polly.loop_header
+  %_p_vec_full = load <4 x double>, <4 x double>* undef, align 8
+  %mulp_vec = fmul <4 x double> %_p_vec_full, <double 7.000000e+00, double 7.000000e+00, double 7.000000e+00, double 7.000000e+00>
+  %addp_vec = fadd <4 x double> undef, %mulp_vec
+  store <4 x double> %addp_vec, <4 x double>* undef, align 8
+  br label %polly.loop_header
+}
--- a/test/CodeGen/Hexagon/vect/vect-illegal-type.ll
+++ b/test/CodeGen/Hexagon/vect/vect-illegal-type.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+; Used to fail with "Unexpected illegal type!"
+; Used to fail with "Cannot select: ch = store x,x,x,<ST4[undef](align=8), trunc to v4i8>"
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+; Reduced control-flow skeleton of empty loops and a switch; the only block
+; doing real work is polly.loop_header228.  It loads a <1 x i16>, splats it
+; to <4 x i16>, truncates that to <4 x i8> and stores the result.
+define void @foo() nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  br label %for.body71
+
+for.body71:                                       ; preds = %for.body71, %for.end
+  br i1 undef, label %for.end96, label %for.body71
+
+for.end96:                                        ; preds = %for.body71
+  switch i32 undef, label %sw.epilog [
+    i32 1, label %for.cond375.preheader
+    i32 8, label %for.cond591
+  ]
+
+for.cond375.preheader:                            ; preds = %for.end96
+  br label %polly.loop_header228
+
+for.cond591:                                      ; preds = %for.end96
+  br label %for.body664
+
+for.body664:                                      ; preds = %for.body664, %for.cond591
+  br i1 undef, label %for.end670, label %for.body664
+
+for.end670:                                       ; preds = %for.body664
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %for.end670, %for.end96
+  ret void
+
+polly.loop_header228:                             ; preds = %polly.loop_header228, %for.cond375.preheader
+  %_p_splat_one = load <1 x i16>, <1 x i16>* undef, align 8
+  %_p_splat = shufflevector <1 x i16> %_p_splat_one, <1 x i16> %_p_splat_one, <4 x i32> zeroinitializer
+  %0 = trunc <4 x i16> %_p_splat to <4 x i8>
+  store <4 x i8> %0, <4 x i8>* undef, align 8
+  br label %polly.loop_header228
+}
--- a/test/CodeGen/Hexagon/vect/vect-insert-extract-elt.ll
+++ b/test/CodeGen/Hexagon/vect/vect-insert-extract-elt.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with an infinite recursion in the insn selection.
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon-unknown-linux-gnu"
+
+%struct.elt = type { [2 x [4 x %struct.block]] }
+%struct.block = type { [2 x i16] }
+
+define void @foo(%struct.elt* noalias nocapture %p0, %struct.elt* noalias nocapture %p1) nounwind { ; builds <4 x i16> vectors with insertelement, shifts them left by 1, and stores each lane via extractelement
+entry:
+  %arrayidx1 = getelementptr inbounds %struct.elt, %struct.elt* %p1, i32 0, i32 0, i32 0, i32 3
+  %arrayidx4 = getelementptr inbounds %struct.elt, %struct.elt* %p1, i32 0, i32 0, i32 0, i32 2
+  %arrayidx7 = getelementptr inbounds %struct.elt, %struct.elt* %p0, i32 0, i32 0, i32 0, i32 3
+  %0 = bitcast %struct.block* %arrayidx7 to i32*
+  %1 = bitcast %struct.block* %arrayidx4 to i32*
+  %2 = load i32, i32* %0, align 4
+  store i32 %2, i32* %1, align 4
+  %3 = bitcast %struct.block* %arrayidx1 to i32*
+  store i32 %2, i32* %3, align 4
+  %arrayidx10 = getelementptr inbounds %struct.elt, %struct.elt* %p1, i32 0, i32 0, i32 0, i32 1
+  %arrayidx16 = getelementptr inbounds %struct.elt, %struct.elt* %p0, i32 0, i32 0, i32 0, i32 2
+  %4 = bitcast %struct.block* %arrayidx16 to i32*
+  %5 = bitcast %struct.elt* %p1 to i32*
+  %6 = load i32, i32* %4, align 4
+  store i32 %6, i32* %5, align 4
+  %7 = bitcast %struct.block* %arrayidx10 to i32*
+  store i32 %6, i32* %7, align 4
+  %p_arrayidx26 = getelementptr %struct.elt, %struct.elt* %p0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1
+  %p_arrayidx2632 = getelementptr %struct.elt, %struct.elt* %p0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1
+  %p_arrayidx2633 = getelementptr %struct.elt, %struct.elt* %p0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 1
+  %p_arrayidx2634 = getelementptr %struct.elt, %struct.elt* %p0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 1
+  %p_arrayidx20 = getelementptr %struct.elt, %struct.elt* %p1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1
+  %p_arrayidx2035 = getelementptr %struct.elt, %struct.elt* %p1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1
+  %p_arrayidx2036 = getelementptr %struct.elt, %struct.elt* %p1, i32 0, i32 0, i32 0, i32 2, i32 0, i32 1
+  %p_arrayidx2037 = getelementptr %struct.elt, %struct.elt* %p1, i32 0, i32 0, i32 0, i32 3, i32 0, i32 1
+  %8 = lshr i32 %6, 16
+  %9 = trunc i32 %8 to i16
+  %_p_vec_ = insertelement <4 x i16> undef, i16 %9, i32 0
+  %_p_vec_39 = insertelement <4 x i16> %_p_vec_, i16 %9, i32 1
+  %10 = lshr i32 %2, 16
+  %11 = trunc i32 %10 to i16
+  %_p_vec_41 = insertelement <4 x i16> %_p_vec_39, i16 %11, i32 2
+  %_p_vec_43 = insertelement <4 x i16> %_p_vec_41, i16 %11, i32 3
+  %shlp_vec = shl <4 x i16> %_p_vec_43, <i16 1, i16 1, i16 1, i16 1> ; first vector shift: lanes come from the high halves of %6 and %2
+  %12 = extractelement <4 x i16> %shlp_vec, i32 0
+  store i16 %12, i16* %p_arrayidx20, align 2
+  %13 = extractelement <4 x i16> %shlp_vec, i32 1
+  store i16 %13, i16* %p_arrayidx2035, align 2
+  %14 = extractelement <4 x i16> %shlp_vec, i32 2
+  store i16 %14, i16* %p_arrayidx2036, align 2
+  %15 = extractelement <4 x i16> %shlp_vec, i32 3
+  store i16 %15, i16* %p_arrayidx2037, align 2
+  %_p_scalar_44 = load i16, i16* %p_arrayidx26, align 2
+  %_p_vec_45 = insertelement <4 x i16> undef, i16 %_p_scalar_44, i32 0
+  %_p_scalar_46 = load i16, i16* %p_arrayidx2632, align 2
+  %_p_vec_47 = insertelement <4 x i16> %_p_vec_45, i16 %_p_scalar_46, i32 1
+  %_p_scalar_48 = load i16, i16* %p_arrayidx2633, align 2
+  %_p_vec_49 = insertelement <4 x i16> %_p_vec_47, i16 %_p_scalar_48, i32 2
+  %_p_scalar_50 = load i16, i16* %p_arrayidx2634, align 2
+  %_p_vec_51 = insertelement <4 x i16> %_p_vec_49, i16 %_p_scalar_50, i32 3
+  %shl28p_vec = shl <4 x i16> %_p_vec_51, <i16 1, i16 1, i16 1, i16 1> ; second vector shift: lanes are four i16 values loaded from %p0
+  %16 = extractelement <4 x i16> %shl28p_vec, i32 0
+  store i16 %16, i16* %p_arrayidx26, align 2
+  %17 = extractelement <4 x i16> %shl28p_vec, i32 1
+  store i16 %17, i16* %p_arrayidx2632, align 2
+  %18 = extractelement <4 x i16> %shl28p_vec, i32 2
+  store i16 %18, i16* %p_arrayidx2633, align 2
+  %19 = extractelement <4 x i16> %shl28p_vec, i32 3
+  store i16 %19, i16* %p_arrayidx2634, align 2
+  ret void
+}
--- a/test/CodeGen/Hexagon/vect/vect-load-1.ll
+++ b/test/CodeGen/Hexagon/vect/vect-load-1.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with "Cannot select: v2i32,ch = load 0x16c5890, 0x16f76e0, 0x16f76e0<LD2[undef](align=8), sext from v2i8>"
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+define void @foo() nounwind { ; bugpoint-reduced reproducer; the vector code is in %polly.loop_body
+entry:
+  br label %polly.loop_header
+
+polly.loop_after:                                 ; preds = %polly.loop_header
+  unreachable
+
+polly.loop_header:                                ; preds = %polly.loop_body, %entry
+  %0 = icmp sle i32 undef, 63
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+
+polly.loop_body:                                  ; preds = %polly.loop_header
+  %_p_vec_full = load <2 x i8>, <2 x i8>* undef, align 8
+  %1 = sext <2 x i8> %_p_vec_full to <2 x i32> ; sign-extension of the loaded <2 x i8> to <2 x i32>
+  %p_vec = mul <2 x i32> %1, <i32 3, i32 3>
+  %mulp_vec = add <2 x i32> %p_vec, <i32 21, i32 21>
+  store <2 x i32> %mulp_vec, <2 x i32>* undef, align 8
+  br label %polly.loop_header
+}
--- a/test/CodeGen/Hexagon/vect/vect-load.ll
+++ b/test/CodeGen/Hexagon/vect/vect-load.ll
@@ -0,0 +1,76 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with "Cannot select: 0x16cf370: v2i16,ch = load"
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+%struct.ext_hdrs.10.65.142.274.307.318.329.681.692.703.714.725.736.758.791.802.846.857.868.879.890.901.945.956.958 = type { i8, i8, i8, i8, i8, i8, i16, i32, [8 x %struct.hcdc_ext_vec.9.64.141.273.306.317.328.680.691.702.713.724.735.757.790.801.845.856.867.878.889.900.944.955.957] }
+%struct.hcdc_ext_vec.9.64.141.273.306.317.328.680.691.702.713.724.735.757.790.801.845.856.867.878.889.900.944.955.957 = type { i8, i8, i16 }
+
+define void @foo(%struct.ext_hdrs.10.65.142.274.307.318.329.681.692.703.714.725.736.758.791.802.846.857.868.879.890.901.945.956.958* %hc_ext_info) nounwind { ; bugpoint-reduced reproducer; the <2 x i8> load is in %if.end240 and feeds a chain of phis
+entry:
+  br i1 undef, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %if.end5, label %if.then3
+
+if.then3:                                         ; preds = %if.end
+  br label %if.end5
+
+if.end5:                                          ; preds = %if.then3, %if.end
+  %add.ptr = getelementptr inbounds %struct.ext_hdrs.10.65.142.274.307.318.329.681.692.703.714.725.736.758.791.802.846.857.868.879.890.901.945.956.958, %struct.ext_hdrs.10.65.142.274.307.318.329.681.692.703.714.725.736.758.791.802.846.857.868.879.890.901.945.956.958* %hc_ext_info, i32 0, i32 8, i32 0
+  %add.ptr22 = getelementptr inbounds %struct.ext_hdrs.10.65.142.274.307.318.329.681.692.703.714.725.736.758.791.802.846.857.868.879.890.901.945.956.958, %struct.ext_hdrs.10.65.142.274.307.318.329.681.692.703.714.725.736.758.791.802.846.857.868.879.890.901.945.956.958* null, i32 0, i32 8, i32 undef
+  br label %while.cond
+
+while.cond:                                       ; preds = %if.end419, %if.end5
+  %gre_chksum.0 = phi <2 x i8> [ undef, %if.end5 ], [ %gre_chksum.2, %if.end419 ]
+  %cmp23 = icmp ult %struct.hcdc_ext_vec.9.64.141.273.306.317.328.680.691.702.713.724.735.757.790.801.845.856.867.878.889.900.944.955.957* null, %add.ptr
+  %cmp25 = icmp ult %struct.hcdc_ext_vec.9.64.141.273.306.317.328.680.691.702.713.724.735.757.790.801.845.856.867.878.889.900.944.955.957* null, %add.ptr22
+  %sel1 = and i1 %cmp23, %cmp25
+  br i1 %sel1, label %while.body, label %while.end422
+
+while.body:                                       ; preds = %while.cond
+  switch i8 undef, label %if.end419 [
+    i8 5, label %if.then70
+    i8 3, label %if.then70
+    i8 2, label %if.then70
+    i8 1, label %if.then70
+    i8 0, label %if.then70
+    i8 4, label %if.then93
+    i8 6, label %if.then195
+  ]
+
+if.then70:                                        ; preds = %while.body, %while.body, %while.body, %while.body, %while.body
+  unreachable
+
+if.then93:                                        ; preds = %while.body
+  unreachable
+
+if.then195:                                       ; preds = %while.body
+  br i1 undef, label %if.end274, label %if.then202
+
+if.then202:                                       ; preds = %if.then195
+  br label %while.body222
+
+while.body222:                                    ; preds = %while.body222, %if.then202
+  br i1 undef, label %if.end240, label %while.body222
+
+if.end240:                                        ; preds = %while.body222
+  %_p_vec_full100 = load <2 x i8>, <2 x i8>* undef, align 8
+  br label %if.end274
+
+if.end274:                                        ; preds = %if.end240, %if.then195
+  %gre_chksum.1 = phi <2 x i8> [ %gre_chksum.0, %if.then195 ], [ %_p_vec_full100, %if.end240 ]
+  br label %if.end419
+
+if.end419:                                        ; preds = %if.end274, %while.body
+  %gre_chksum.2 = phi <2 x i8> [ %gre_chksum.0, %while.body ], [ %gre_chksum.1, %if.end274 ]
+  br label %while.cond
+
+while.end422:                                     ; preds = %while.cond
+  ret void
+}
--- a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll
+++ b/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll
@@ -0,0 +1,73 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+
+; Check for a halfword load at offset #6, a register combine, and vaddh (expected lowering of the align-2 <4 x i16> load and add).
+; CHECK: memuh(r{{[0-9]+}} + {{ *}}#6{{ *}})
+; CHECK: combine(r{{[0-9]+}}{{ *}},{{ *}}r{{[0-9]+}}{{ *}})
+; CHECK: vaddh
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+define void @matrix_add_const(i32 %N, i16* nocapture %A, i16 signext %val) #0 { ; adds %val to i16 elements of %A: a <4 x i16> loop (%polly.loop_body) plus a scalar loop (%polly.stmt.for.body29)
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %polly.cond
+
+for.end.loopexit:                                 ; preds = %polly.stmt.for.body29
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %polly.loop_header24.preheader, %entry
+  ret void
+
+polly.cond:                                       ; preds = %entry
+  %0 = icmp sgt i32 %N, 3
+  br i1 %0, label %polly.then, label %polly.loop_header24.preheader
+
+polly.then:                                       ; preds = %polly.cond
+  %1 = add i32 %N, -1
+  %leftover_lb = and i32 %1, -4
+  %2 = icmp sgt i32 %leftover_lb, 0
+  br i1 %2, label %polly.loop_body.lr.ph, label %polly.loop_header24.preheader
+
+polly.loop_body.lr.ph:                            ; preds = %polly.then
+  %3 = insertelement <4 x i16> undef, i16 %val, i32 0
+  %4 = insertelement <4 x i16> %3, i16 %val, i32 1
+  %5 = insertelement <4 x i16> %4, i16 %val, i32 2
+  %6 = insertelement <4 x i16> %5, i16 %val, i32 3 ; %6 now holds %val in all four lanes
+  br label %polly.loop_body
+
+polly.loop_header24.preheader.loopexit:           ; preds = %polly.loop_body
+  br label %polly.loop_header24.preheader
+
+polly.loop_header24.preheader:                    ; preds = %polly.loop_header24.preheader.loopexit, %polly.then, %polly.cond
+  %polly.loopiv27.ph = phi i32 [ 0, %polly.cond ], [ %leftover_lb, %polly.then ], [ %leftover_lb, %polly.loop_header24.preheader.loopexit ]
+  %7 = icmp slt i32 %polly.loopiv27.ph, %N
+  br i1 %7, label %polly.stmt.for.body29.preheader, label %for.end
+
+polly.stmt.for.body29.preheader:                  ; preds = %polly.loop_header24.preheader
+  br label %polly.stmt.for.body29
+
+polly.loop_body:                                  ; preds = %polly.loop_body.lr.ph, %polly.loop_body
+  %p_arrayidx.phi = phi i16* [ %A, %polly.loop_body.lr.ph ], [ %p_arrayidx.inc, %polly.loop_body ]
+  %polly.loopiv34 = phi i32 [ 0, %polly.loop_body.lr.ph ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv34, 4
+  %vector_ptr = bitcast i16* %p_arrayidx.phi to <4 x i16>*
+  %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2
+  %addp_vec = add <4 x i16> %_p_vec_full, %6
+  store <4 x i16> %addp_vec, <4 x i16>* %vector_ptr, align 2
+  %8 = icmp slt i32 %polly.next_loopiv, %leftover_lb
+  %p_arrayidx.inc = getelementptr i16, i16* %p_arrayidx.phi, i32 4
+  br i1 %8, label %polly.loop_body, label %polly.loop_header24.preheader.loopexit
+
+polly.stmt.for.body29:                            ; preds = %polly.stmt.for.body29.preheader, %polly.stmt.for.body29
+  %polly.loopiv2733 = phi i32 [ %polly.next_loopiv28, %polly.stmt.for.body29 ], [ %polly.loopiv27.ph, %polly.stmt.for.body29.preheader ]
+  %polly.next_loopiv28 = add nsw i32 %polly.loopiv2733, 1
+  %p_arrayidx30 = getelementptr i16, i16* %A, i32 %polly.loopiv2733
+  %_p_scalar_ = load i16, i16* %p_arrayidx30, align 2
+  %p_add = add i16 %_p_scalar_, %val
+  store i16 %p_add, i16* %p_arrayidx30, align 2
+  %exitcond = icmp eq i32 %polly.next_loopiv28, %N
+  br i1 %exitcond, label %for.end.loopexit, label %polly.stmt.for.body29
+}
+
+attributes #0 = { nounwind "fp-contract-model"="standard" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="static" "ssp-buffers-size"="8" }
--- a/test/CodeGen/Hexagon/vect/vect-mul-v2i16.ll
+++ b/test/CodeGen/Hexagon/vect/vect-mul-v2i16.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vmpyh
+; CHECK: vtrunewh
+
+define <2 x i16> @t_i2x16(<2 x i16> %a, <2 x i16> %b) nounwind { ; returns the element-wise <2 x i16> product of %a and %b
+entry:
+	%0 = mul <2 x i16> %a, %b
+	ret <2 x i16> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-mul-v2i32.ll
+++ b/test/CodeGen/Hexagon/vect/vect-mul-v2i32.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: mpyi
+; CHECK: mpyi
+
+define <2 x i32> @t_i2x32(<2 x i32> %a, <2 x i32> %b) nounwind { ; returns the element-wise <2 x i32> product of %a and %b
+entry:
+	%0 = mul <2 x i32> %a, %b
+	ret <2 x i32> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-mul-v4i16.ll
+++ b/test/CodeGen/Hexagon/vect/vect-mul-v4i16.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vmpyh
+; CHECK: vmpyh
+; CHECK: vtrunewh
+
+define <4 x i16> @t_i4x16(<4 x i16> %a, <4 x i16> %b) nounwind { ; returns the element-wise <4 x i16> product of %a and %b
+entry:
+	%0 = mul <4 x i16> %a, %b
+	ret <4 x i16> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-mul-v4i8.ll
+++ b/test/CodeGen/Hexagon/vect/vect-mul-v4i8.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; CHECK: vmpybsu
+; CHECK: vtrunehb
+
+define <4 x i8> @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind { ; returns the element-wise <4 x i8> product of %a and %b
+entry:
+	%0 = mul <4 x i8> %a, %b
+	ret <4 x i8> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-mul-v8i8.ll
+++ b/test/CodeGen/Hexagon/vect/vect-mul-v8i8.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; CHECK: vmpybsu
+; CHECK: vmpybsu
+
+define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind { ; returns the element-wise <8 x i8> product of %a and %b
+entry:
+	%0 = mul <8 x i8> %a, %b
+	ret <8 x i8> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-no-tfrs-1.ll
+++ b/test/CodeGen/Hexagon/vect/vect-no-tfrs-1.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK-NOT: r1:0 = r1:0
+
+define <4 x i16> @t_i4x16(<4 x i16> %a, <4 x i16> %b) nounwind { ; <4 x i16> multiply; the CHECK-NOT above rejects an "r1:0 = r1:0" self-copy in the output
+entry:
+	%0 = mul <4 x i16> %a, %b
+	ret <4 x i16> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-no-tfrs.ll
+++ b/test/CodeGen/Hexagon/vect/vect-no-tfrs.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK-NOT: r1:0 = combine(r1, r0)
+
+define <4 x i8> @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind { ; <4 x i8> multiply; the CHECK-NOT above rejects "r1:0 = combine(r1, r0)" in the output
+entry:
+	%0 = mul <4 x i8> %a, %b
+	ret <4 x i8> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-packhl.ll
+++ b/test/CodeGen/Hexagon/vect/vect-packhl.ll
@@ -0,0 +1,10 @@
+; Extracted from test/CodeGen/Generic/vector-casts.ll: used to loop indefinitely.
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: packhl
+
+define void @a(<2 x double>* %p, <2 x i8>* %q) { ; loads <2 x double> from %p, converts to <2 x i8> with fptosi, stores to %q
+        %t = load <2 x double>, <2 x double>* %p
+	%r = fptosi <2 x double> %t to <2 x i8>
+        store <2 x i8> %r, <2 x i8>* %q
+	ret void
+}
--- a/test/CodeGen/Hexagon/vect/vect-shift-imm.ll
+++ b/test/CodeGen/Hexagon/vect/vect-shift-imm.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s --check-prefix=CHECK-ASLW
+; RUN: llc -march=hexagon < %s | FileCheck %s --check-prefix=CHECK-ASRW
+; RUN: llc -march=hexagon < %s | FileCheck %s --check-prefix=CHECK-LSRW
+; RUN: llc -march=hexagon < %s | FileCheck %s --check-prefix=CHECK-ASLH
+; RUN: llc -march=hexagon < %s | FileCheck %s --check-prefix=CHECK-ASRH
+; RUN: llc -march=hexagon < %s | FileCheck %s --check-prefix=CHECK-LSRH
+;
+; Make sure that the instructions with immediate operands are generated.
+; CHECK-ASLW: vaslw({{.*}}, #9)
+; CHECK-ASRW: vasrw({{.*}}, #8)
+; CHECK-LSRW: vlsrw({{.*}}, #7)
+; CHECK-ASLH: vaslh({{.*}}, #6)
+; CHECK-ASRH: vasrh({{.*}}, #5)
+; CHECK-LSRH: vlsrh({{.*}}, #4)
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+define i64 @foo(i64 %x) nounwind readnone { ; calls each of the six immediate-shift intrinsics on %x with a distinct immediate and returns the sum of the results
+entry:
+  %0 = tail call i64 @llvm.hexagon.S2.asl.i.vw(i64 %x, i32 9)
+  %1 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %x, i32 8)
+  %2 = tail call i64 @llvm.hexagon.S2.lsr.i.vw(i64 %x, i32 7)
+  %3 = tail call i64 @llvm.hexagon.S2.asl.i.vh(i64 %x, i32 6)
+  %4 = tail call i64 @llvm.hexagon.S2.asr.i.vh(i64 %x, i32 5)
+  %5 = tail call i64 @llvm.hexagon.S2.lsr.i.vh(i64 %x, i32 4)
+  %add = add i64 %1, %0
+  %add1 = add i64 %add, %2
+  %add2 = add i64 %add1, %3
+  %add3 = add i64 %add2, %4
+  %add4 = add i64 %add3, %5
+  ret i64 %add4
+}
+
+declare i64 @llvm.hexagon.S2.asl.i.vw(i64, i32) nounwind readnone
+declare i64 @llvm.hexagon.S2.asr.i.vw(i64, i32) nounwind readnone
+declare i64 @llvm.hexagon.S2.lsr.i.vw(i64, i32) nounwind readnone
+declare i64 @llvm.hexagon.S2.asl.i.vh(i64, i32) nounwind readnone
+declare i64 @llvm.hexagon.S2.asr.i.vh(i64, i32) nounwind readnone
+declare i64 @llvm.hexagon.S2.lsr.i.vh(i64, i32) nounwind readnone
+
--- a/test/CodeGen/Hexagon/vect/vect-shuffle.ll
+++ b/test/CodeGen/Hexagon/vect/vect-shuffle.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+
+; Check that no extractu or insert instructions are generated.
+; CHECK-NOT: extractu
+; CHECK-NOT: insert
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+define i32 @foo(i16* noalias nocapture %src, i16* noalias nocapture %dstImg, i32 %width, i32 %idx, i32 %flush) #0 { ; loop: load <4 x i16> from %src, widen each half to <2 x i32>, add %1, truncate, recombine and store to %dstImg
+entry:
+  %0 = tail call i64 @llvm.hexagon.A2.combinew(i32 %flush, i32 %flush)
+  %1 = bitcast i64 %0 to <2 x i32>
+  br label %polly.loop_body
+
+polly.loop_after:                                 ; preds = %polly.loop_body
+  ret i32 0
+
+polly.loop_body:                                  ; preds = %entry, %polly.loop_body
+  %p_arrayidx35.phi = phi i16* [ %dstImg, %entry ], [ %p_arrayidx35.inc, %polly.loop_body ]
+  %p_arrayidx.phi = phi i16* [ %src, %entry ], [ %p_arrayidx.inc, %polly.loop_body ]
+  %polly.loopiv56 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv56, 4
+  %vector_ptr = bitcast i16* %p_arrayidx.phi to <4 x i16>*
+  %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2
+  %_high_half = shufflevector <4 x i16> %_p_vec_full, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+  %_low_half = shufflevector <4 x i16> %_p_vec_full, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %2 = zext <2 x i16> %_low_half to <2 x i32>
+  %3 = zext <2 x i16> %_high_half to <2 x i32>
+  %add33p_vec = add <2 x i32> %2, %1
+  %add33p_vec48 = add <2 x i32> %3, %1
+  %4 = trunc <2 x i32> %add33p_vec to <2 x i16>
+  %5 = trunc <2 x i32> %add33p_vec48 to <2 x i16>
+  %_combined_vec = shufflevector <2 x i16> %4, <2 x i16> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; concatenates the two <2 x i16> halves back into a <4 x i16>
+  %vector_ptr49 = bitcast i16* %p_arrayidx35.phi to <4 x i16>*
+  store <4 x i16> %_combined_vec, <4 x i16>* %vector_ptr49, align 2
+  %6 = icmp slt i32 %polly.next_loopiv, 1024
+  %p_arrayidx35.inc = getelementptr i16, i16* %p_arrayidx35.phi, i32 4
+  %p_arrayidx.inc = getelementptr i16, i16* %p_arrayidx.phi, i32 4
+  br i1 %6, label %polly.loop_body, label %polly.loop_after
+}
+
+declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+
--- a/test/CodeGen/Hexagon/vect/vect-splat.ll
+++ b/test/CodeGen/Hexagon/vect/vect-splat.ll
@@ -0,0 +1,16 @@
+; Extracted from test/CodeGen/Generic/vector.ll: used to loop indefinitely.
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; CHECK: combine
+
+%i4 = type <4 x i32>
+
+define void @splat_i4(%i4* %P, %i4* %Q, i32 %X) { ; inserts %X into all four lanes, adds that to the vector loaded from %Q, stores the sum to %P
+	%tmp = insertelement %i4 undef, i32 %X, i32 0		; <%i4> [#uses=1]
+	%tmp2 = insertelement %i4 %tmp, i32 %X, i32 1		; <%i4> [#uses=1]
+	%tmp4 = insertelement %i4 %tmp2, i32 %X, i32 2		; <%i4> [#uses=1]
+	%tmp6 = insertelement %i4 %tmp4, i32 %X, i32 3		; <%i4> [#uses=1]
+	%q = load %i4, %i4* %Q		; <%i4> [#uses=1]
+	%R = add %i4 %q, %tmp6		; <%i4> [#uses=1]
+	store %i4 %R, %i4* %P
+	ret void
+}
--- a/test/CodeGen/Hexagon/vect/vect-store-v2i16.ll
+++ b/test/CodeGen/Hexagon/vect/vect-store-v2i16.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with: "Cannot select: 0x3bab680: ch = store <ST4[%lsr.iv522525], trunc to v2i16>"
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+define void @foobar() nounwind { ; bugpoint-reduced reproducer; the vector code is in %for.end and %polly.loop_body377
+entry:
+  br label %for.cond7.preheader.single_entry.i
+
+for.cond7.preheader.single_entry.i:               ; preds = %for.cond7.preheader.single_entry.i, %entry
+  %exitcond72.i = icmp eq i32 undef, 64
+  br i1 %exitcond72.i, label %foo_32.exit, label %for.cond7.preheader.single_entry.i
+
+foo_32.exit:                         ; preds = %for.cond7.preheader.single_entry.i
+  br label %for.body.i428
+
+for.body.i428:                                    ; preds = %for.body.i428, %foo_32.exit
+  br i1 undef, label %foo_12.exit, label %for.body.i428
+
+foo_12.exit:                            ; preds = %for.body.i428
+  br label %for.body.i.i
+
+for.body.i.i:                                     ; preds = %for.body.i.i, %foo_12.exit
+  br i1 undef, label %foo_14.exit, label %for.body.i.i
+
+foo_14.exit:                         ; preds = %for.body.i.i
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %foo_14.exit
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %storemerge294 = select i1 undef, i32 32767, i32 undef
+  %_p_splat_one386 = insertelement <1 x i32> undef, i32 %storemerge294, i32 0
+  %_p_splat387 = shufflevector <1 x i32> %_p_splat_one386, <1 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: both result lanes take element 0
+  br label %polly.loop_body377
+
+polly.loop_after378:                              ; preds = %polly.loop_body377
+  unreachable
+
+polly.loop_body377:                               ; preds = %polly.loop_body377, %for.end
+  %_p_vec_full384 = load <2 x i16>, <2 x i16>* undef, align 4
+  %0 = sext <2 x i16> %_p_vec_full384 to <2 x i32>
+  %mulp_vec = mul <2 x i32> %0, %_p_splat387
+  %shr100293p_vec = lshr <2 x i32> %mulp_vec, <i32 15, i32 15>
+  %1 = trunc <2 x i32> %shr100293p_vec to <2 x i16> ; <2 x i32> -> <2 x i16> truncate feeding the store below
+  store <2 x i16> %1, <2 x i16>* undef, align 4
+  br i1 undef, label %polly.loop_body377, label %polly.loop_after378
+}
+
--- a/test/CodeGen/Hexagon/vect/vect-truncate.ll
+++ b/test/CodeGen/Hexagon/vect/vect-truncate.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with "Cannot select: 0x16cb7f0: v2i16 = truncate"
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+define void @Autocorr() nounwind { ; bugpoint-reduced reproducer; the vector code is in %polly.loop_body
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 undef, label %polly.loop_header43, label %for.body
+
+do.cond:                                          ; preds = %polly.loop_header
+  unreachable
+
+do.end:                                           ; preds = %polly.loop_after45
+  ret void
+
+polly.loop_header:                                ; preds = %polly.loop_after45, %polly.loop_body
+  %0 = icmp sle i32 undef, 239
+  br i1 %0, label %polly.loop_body, label %do.cond
+
+polly.loop_body:                                  ; preds = %polly.loop_header
+  %p_25 = call i32 @llvm.hexagon.SI.to.SXTHI.asrh(i32 undef)
+  %1 = insertelement <4 x i32> undef, i32 %p_25, i32 3
+  %2 = trunc <4 x i32> %1 to <4 x i16> ; <4 x i32> -> <4 x i16> truncate feeding the store below
+  store <4 x i16> %2, <4 x i16>* undef, align 8
+  br label %polly.loop_header
+
+polly.loop_after45:                               ; preds = %polly.loop_header43
+  br i1 undef, label %polly.loop_header, label %do.end
+
+polly.loop_header43:                              ; preds = %polly.loop_body44, %for.body
+  br i1 undef, label %polly.loop_body44, label %polly.loop_after45
+
+polly.loop_body44:                                ; preds = %polly.loop_header43
+  br label %polly.loop_header43
+}
+
+declare i32 @llvm.hexagon.SI.to.SXTHI.asrh(i32) nounwind readnone
--- a/test/CodeGen/Hexagon/vect/vect-vaddb-1.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vaddb-1.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vaddub
+
+define <4 x i8> @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind { ; returns the element-wise <4 x i8> sum of %a and %b
+entry:
+	%0 = add <4 x i8> %a, %b
+	ret <4 x i8> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vaddb.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vaddb.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vaddub
+
+define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind { ; returns the element-wise <8 x i8> sum of %a and %b
+entry:
+	%0 = add <8 x i8> %a, %b
+	ret <8 x i8> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vaddh-1.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vaddh-1.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vaddh
+
+define <4 x i16> @t_i4x16(<4 x i16> %a, <4 x i16> %b) nounwind { ; returns the element-wise <4 x i16> sum of %a and %b
+entry:
+	%0 = add <4 x i16> %a, %b
+	ret <4 x i16> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vaddh.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vaddh.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vaddh
+
+define <2 x i16> @t_i2x16(<2 x i16> %a, <2 x i16> %b) nounwind { ; returns the element-wise <2 x i16> sum of %a and %b
+entry:
+	%0 = add <2 x i16> %a, %b
+	ret <2 x i16> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vaddw.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vaddw.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vaddw
+
+define <2 x i32> @t_i2x32(<2 x i32> %a, <2 x i32> %b) nounwind { ; returns the element-wise <2 x i32> sum of %a and %b
+entry:
+	%0 = add <2 x i32> %a, %b
+	ret <2 x i32> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vaslw.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vaslw.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vaslw
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon-unknown-linux-gnu"
+
+define void @foo(i16* nocapture %v) nounwind { ; computes v[0..3] = trunc((sext(v[4..7]) << 2) + 34 + zext(v[0..3])), done as two <2 x i32> halves
+entry:
+  %p_arrayidx = getelementptr i16, i16* %v, i32 4
+  %vector_ptr = bitcast i16* %p_arrayidx to <4 x i16>*
+  %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2
+  %_high_half = shufflevector <4 x i16> %_p_vec_full, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+  %_low_half = shufflevector <4 x i16> %_p_vec_full, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %0 = sext <2 x i16> %_low_half to <2 x i32>
+  %1 = sext <2 x i16> %_high_half to <2 x i32>
+  %shr6p_vec = shl <2 x i32> %0, <i32 2, i32 2> ; despite the "shr" in the name, this is a left shift by 2
+  %shr6p_vec19 = shl <2 x i32> %1, <i32 2, i32 2>
+  %addp_vec = add <2 x i32> %shr6p_vec, <i32 34, i32 34>
+  %addp_vec20 = add <2 x i32> %shr6p_vec19, <i32 34, i32 34>
+  %vector_ptr21 = bitcast i16* %v to <4 x i16>*
+  %_p_vec_full22 = load <4 x i16>, <4 x i16>* %vector_ptr21, align 2
+  %_high_half23 = shufflevector <4 x i16> %_p_vec_full22, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+  %_low_half24 = shufflevector <4 x i16> %_p_vec_full22, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %2 = zext <2 x i16> %_low_half24 to <2 x i32>
+  %3 = zext <2 x i16> %_high_half23 to <2 x i32>
+  %add3p_vec = add <2 x i32> %addp_vec, %2
+  %add3p_vec25 = add <2 x i32> %addp_vec20, %3
+  %4 = trunc <2 x i32> %add3p_vec to <2 x i16>
+  %5 = trunc <2 x i32> %add3p_vec25 to <2 x i16>
+  %_combined_vec = shufflevector <2 x i16> %4, <2 x i16> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %_combined_vec, <4 x i16>* %vector_ptr21, align 2
+  ret void
+}
--- a/test/CodeGen/Hexagon/vect/vect-vshifts.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vshifts.ll
@@ -0,0 +1,279 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+
+; Check that vector word shifts by a register amount are generated (vasrw and vaslw).
+; CHECK: r{{[0-9]+:[0-9]+}} = vasrw(r{{[0-9]+:[0-9]+}}, r{{[0-9]+}})
+; CHECK: r{{[0-9]+:[0-9]+}} = vaslw(r{{[0-9]+:[0-9]+}}, r{{[0-9]+}})
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+define void @foo(i32* nocapture %buf, i32* nocapture %dest, i32 %offset, i32 %oddBlock, i32 %gb) #0 { ; entry setup, then eight unrolled copies of one <2 x i32> shift sequence over %dest
+entry:
+  %0 = load i32, i32* %buf, align 4, !tbaa !0
+  %shr = ashr i32 %0, %gb
+  store i32 %shr, i32* %buf, align 4, !tbaa !0 ; buf[0] = buf[0] >> gb (arithmetic)
+  %not.tobool = icmp eq i32 %oddBlock, 0
+  %1 = sub i32 %offset, %oddBlock
+  %2 = zext i1 %not.tobool to i32
+  %3 = and i32 %1, 7
+  %4 = add i32 %2, %3 ; base index into %dest = (oddBlock == 0) + ((offset - oddBlock) & 7)
+  %5 = add i32 %4, 8
+  %p_sub8 = sub nsw i32 31, %gb
+  %6 = insertelement <2 x i32> undef, i32 %p_sub8, i32 0
+  %7 = insertelement <2 x i32> %6, i32 %p_sub8, i32 1 ; %7 = splat of (31 - gb); it is the ashr amount in every unrolled copy
+  %8 = bitcast <2 x i32> %7 to i64
+  %9 = tail call i64 @llvm.hexagon.S2.asl.i.vw(i64 %8, i32 1)
+  %10 = bitcast i64 %9 to <2 x i32>
+  %11 = tail call i64 @llvm.hexagon.A2.combinew(i32 -1, i32 -1)
+  %12 = bitcast i64 %11 to <2 x i32>
+  %sub12p_vec = add <2 x i32> %10, %12 ; adds the combinew(-1, -1) value to the asl.i.vw result; reused by every copy's xor
+  %p_22 = add i32 %4, 64
+  %p_d.018 = getelementptr i32, i32* %dest, i32 %4
+  %p_d.01823 = getelementptr i32, i32* %dest, i32 %p_22
+  %p_25 = add i32 %4, 72
+  %p_arrayidx14 = getelementptr i32, i32* %dest, i32 %5
+  %p_arrayidx1426 = getelementptr i32, i32* %dest, i32 %p_25
+  %_p_scalar_ = load i32, i32* %p_d.018, align 4
+  %_p_vec_ = insertelement <2 x i32> undef, i32 %_p_scalar_, i32 0
+  %_p_scalar_27 = load i32, i32* %p_d.01823, align 4
+  %_p_vec_28 = insertelement <2 x i32> %_p_vec_, i32 %_p_scalar_27, i32 1 ; lanes = dest[%4], dest[%4 + 64]
+  %13 = bitcast <2 x i32> %_p_vec_28 to i64
+  %14 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %13, i32 31)
+  %15 = bitcast i64 %14 to <2 x i32>
+  %shr9p_vec = ashr <2 x i32> %_p_vec_28, %7 ; plain IR vector ashr by the non-constant splat %7
+  %xorp_vec = xor <2 x i32> %15, %sub12p_vec
+  %16 = bitcast <2 x i32> %shr9p_vec to i64
+  %17 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %14, i64 %16)
+  %18 = bitcast <2 x i32> %xorp_vec to i64
+  %19 = tail call i64 @llvm.hexagon.C2.vmux(i32 %17, i64 %13, i64 %18)
+  %20 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %19, i32 %gb) ; shift intrinsic taking the run-time amount %gb
+  %21 = bitcast i64 %20 to <2 x i32>
+  %22 = extractelement <2 x i32> %21, i32 0
+  store i32 %22, i32* %p_arrayidx14, align 4 ; both lanes are stored at dest[%4 + 8] / dest[%4 + 72] and also back to the two loaded slots
+  %23 = extractelement <2 x i32> %21, i32 1
+  store i32 %23, i32* %p_arrayidx1426, align 4
+  store i32 %22, i32* %p_d.018, align 4
+  store i32 %23, i32* %p_d.01823, align 4
+  %p_21.1 = add i32 %4, 128 ; second unrolled copy; the %dest indices advance by 128 from one copy to the next
+  %p_22.1 = add i32 %4, 192
+  %p_d.018.1 = getelementptr i32, i32* %dest, i32 %p_21.1
+  %p_d.01823.1 = getelementptr i32, i32* %dest, i32 %p_22.1
+  %p_24.1 = add i32 %4, 136
+  %p_25.1 = add i32 %4, 200
+  %p_arrayidx14.1 = getelementptr i32, i32* %dest, i32 %p_24.1
+  %p_arrayidx1426.1 = getelementptr i32, i32* %dest, i32 %p_25.1
+  %_p_scalar_.1 = load i32, i32* %p_d.018.1, align 4
+  %_p_vec_.1 = insertelement <2 x i32> undef, i32 %_p_scalar_.1, i32 0
+  %_p_scalar_27.1 = load i32, i32* %p_d.01823.1, align 4
+  %_p_vec_28.1 = insertelement <2 x i32> %_p_vec_.1, i32 %_p_scalar_27.1, i32 1
+  %24 = bitcast <2 x i32> %_p_vec_28.1 to i64
+  %25 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %24, i32 31)
+  %26 = bitcast i64 %25 to <2 x i32>
+  %shr9p_vec.1 = ashr <2 x i32> %_p_vec_28.1, %7
+  %xorp_vec.1 = xor <2 x i32> %26, %sub12p_vec
+  %27 = bitcast <2 x i32> %shr9p_vec.1 to i64
+  %28 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %25, i64 %27)
+  %29 = bitcast <2 x i32> %xorp_vec.1 to i64
+  %30 = tail call i64 @llvm.hexagon.C2.vmux(i32 %28, i64 %24, i64 %29)
+  %31 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %30, i32 %gb)
+  %32 = bitcast i64 %31 to <2 x i32>
+  %33 = extractelement <2 x i32> %32, i32 0
+  store i32 %33, i32* %p_arrayidx14.1, align 4
+  %34 = extractelement <2 x i32> %32, i32 1
+  store i32 %34, i32* %p_arrayidx1426.1, align 4
+  store i32 %33, i32* %p_d.018.1, align 4
+  store i32 %34, i32* %p_d.01823.1, align 4
+  %p_21.2 = add i32 %4, 256
+  %p_22.2 = add i32 %4, 320
+  %p_d.018.2 = getelementptr i32, i32* %dest, i32 %p_21.2
+  %p_d.01823.2 = getelementptr i32, i32* %dest, i32 %p_22.2
+  %p_24.2 = add i32 %4, 264
+  %p_25.2 = add i32 %4, 328
+  %p_arrayidx14.2 = getelementptr i32, i32* %dest, i32 %p_24.2
+  %p_arrayidx1426.2 = getelementptr i32, i32* %dest, i32 %p_25.2
+  %_p_scalar_.2 = load i32, i32* %p_d.018.2, align 4
+  %_p_vec_.2 = insertelement <2 x i32> undef, i32 %_p_scalar_.2, i32 0
+  %_p_scalar_27.2 = load i32, i32* %p_d.01823.2, align 4
+  %_p_vec_28.2 = insertelement <2 x i32> %_p_vec_.2, i32 %_p_scalar_27.2, i32 1
+  %35 = bitcast <2 x i32> %_p_vec_28.2 to i64
+  %36 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %35, i32 31)
+  %37 = bitcast i64 %36 to <2 x i32>
+  %shr9p_vec.2 = ashr <2 x i32> %_p_vec_28.2, %7
+  %xorp_vec.2 = xor <2 x i32> %37, %sub12p_vec
+  %38 = bitcast <2 x i32> %shr9p_vec.2 to i64
+  %39 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %36, i64 %38)
+  %40 = bitcast <2 x i32> %xorp_vec.2 to i64
+  %41 = tail call i64 @llvm.hexagon.C2.vmux(i32 %39, i64 %35, i64 %40)
+  %42 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %41, i32 %gb)
+  %43 = bitcast i64 %42 to <2 x i32>
+  %44 = extractelement <2 x i32> %43, i32 0
+  store i32 %44, i32* %p_arrayidx14.2, align 4
+  %45 = extractelement <2 x i32> %43, i32 1
+  store i32 %45, i32* %p_arrayidx1426.2, align 4
+  store i32 %44, i32* %p_d.018.2, align 4
+  store i32 %45, i32* %p_d.01823.2, align 4
+  %p_21.3 = add i32 %4, 384
+  %p_22.3 = add i32 %4, 448
+  %p_d.018.3 = getelementptr i32, i32* %dest, i32 %p_21.3
+  %p_d.01823.3 = getelementptr i32, i32* %dest, i32 %p_22.3
+  %p_24.3 = add i32 %4, 392
+  %p_25.3 = add i32 %4, 456
+  %p_arrayidx14.3 = getelementptr i32, i32* %dest, i32 %p_24.3
+  %p_arrayidx1426.3 = getelementptr i32, i32* %dest, i32 %p_25.3
+  %_p_scalar_.3 = load i32, i32* %p_d.018.3, align 4
+  %_p_vec_.3 = insertelement <2 x i32> undef, i32 %_p_scalar_.3, i32 0
+  %_p_scalar_27.3 = load i32, i32* %p_d.01823.3, align 4
+  %_p_vec_28.3 = insertelement <2 x i32> %_p_vec_.3, i32 %_p_scalar_27.3, i32 1
+  %46 = bitcast <2 x i32> %_p_vec_28.3 to i64
+  %47 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %46, i32 31)
+  %48 = bitcast i64 %47 to <2 x i32>
+  %shr9p_vec.3 = ashr <2 x i32> %_p_vec_28.3, %7
+  %xorp_vec.3 = xor <2 x i32> %48, %sub12p_vec
+  %49 = bitcast <2 x i32> %shr9p_vec.3 to i64
+  %50 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %47, i64 %49)
+  %51 = bitcast <2 x i32> %xorp_vec.3 to i64
+  %52 = tail call i64 @llvm.hexagon.C2.vmux(i32 %50, i64 %46, i64 %51)
+  %53 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %52, i32 %gb)
+  %54 = bitcast i64 %53 to <2 x i32>
+  %55 = extractelement <2 x i32> %54, i32 0
+  store i32 %55, i32* %p_arrayidx14.3, align 4
+  %56 = extractelement <2 x i32> %54, i32 1
+  store i32 %56, i32* %p_arrayidx1426.3, align 4
+  store i32 %55, i32* %p_d.018.3, align 4
+  store i32 %56, i32* %p_d.01823.3, align 4
+  %p_21.4 = add i32 %4, 512
+  %p_22.4 = add i32 %4, 576
+  %p_d.018.4 = getelementptr i32, i32* %dest, i32 %p_21.4
+  %p_d.01823.4 = getelementptr i32, i32* %dest, i32 %p_22.4
+  %p_24.4 = add i32 %4, 520
+  %p_25.4 = add i32 %4, 584
+  %p_arrayidx14.4 = getelementptr i32, i32* %dest, i32 %p_24.4
+  %p_arrayidx1426.4 = getelementptr i32, i32* %dest, i32 %p_25.4
+  %_p_scalar_.4 = load i32, i32* %p_d.018.4, align 4
+  %_p_vec_.4 = insertelement <2 x i32> undef, i32 %_p_scalar_.4, i32 0
+  %_p_scalar_27.4 = load i32, i32* %p_d.01823.4, align 4
+  %_p_vec_28.4 = insertelement <2 x i32> %_p_vec_.4, i32 %_p_scalar_27.4, i32 1
+  %57 = bitcast <2 x i32> %_p_vec_28.4 to i64
+  %58 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %57, i32 31)
+  %59 = bitcast i64 %58 to <2 x i32>
+  %shr9p_vec.4 = ashr <2 x i32> %_p_vec_28.4, %7
+  %xorp_vec.4 = xor <2 x i32> %59, %sub12p_vec
+  %60 = bitcast <2 x i32> %shr9p_vec.4 to i64
+  %61 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %58, i64 %60)
+  %62 = bitcast <2 x i32> %xorp_vec.4 to i64
+  %63 = tail call i64 @llvm.hexagon.C2.vmux(i32 %61, i64 %57, i64 %62)
+  %64 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %63, i32 %gb)
+  %65 = bitcast i64 %64 to <2 x i32>
+  %66 = extractelement <2 x i32> %65, i32 0
+  store i32 %66, i32* %p_arrayidx14.4, align 4
+  %67 = extractelement <2 x i32> %65, i32 1
+  store i32 %67, i32* %p_arrayidx1426.4, align 4
+  store i32 %66, i32* %p_d.018.4, align 4
+  store i32 %67, i32* %p_d.01823.4, align 4
+  %p_21.5 = add i32 %4, 640
+  %p_22.5 = add i32 %4, 704
+  %p_d.018.5 = getelementptr i32, i32* %dest, i32 %p_21.5
+  %p_d.01823.5 = getelementptr i32, i32* %dest, i32 %p_22.5
+  %p_24.5 = add i32 %4, 648
+  %p_25.5 = add i32 %4, 712
+  %p_arrayidx14.5 = getelementptr i32, i32* %dest, i32 %p_24.5
+  %p_arrayidx1426.5 = getelementptr i32, i32* %dest, i32 %p_25.5
+  %_p_scalar_.5 = load i32, i32* %p_d.018.5, align 4
+  %_p_vec_.5 = insertelement <2 x i32> undef, i32 %_p_scalar_.5, i32 0
+  %_p_scalar_27.5 = load i32, i32* %p_d.01823.5, align 4
+  %_p_vec_28.5 = insertelement <2 x i32> %_p_vec_.5, i32 %_p_scalar_27.5, i32 1
+  %68 = bitcast <2 x i32> %_p_vec_28.5 to i64
+  %69 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %68, i32 31)
+  %70 = bitcast i64 %69 to <2 x i32>
+  %shr9p_vec.5 = ashr <2 x i32> %_p_vec_28.5, %7
+  %xorp_vec.5 = xor <2 x i32> %70, %sub12p_vec
+  %71 = bitcast <2 x i32> %shr9p_vec.5 to i64
+  %72 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %69, i64 %71)
+  %73 = bitcast <2 x i32> %xorp_vec.5 to i64
+  %74 = tail call i64 @llvm.hexagon.C2.vmux(i32 %72, i64 %68, i64 %73)
+  %75 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %74, i32 %gb)
+  %76 = bitcast i64 %75 to <2 x i32>
+  %77 = extractelement <2 x i32> %76, i32 0
+  store i32 %77, i32* %p_arrayidx14.5, align 4
+  %78 = extractelement <2 x i32> %76, i32 1
+  store i32 %78, i32* %p_arrayidx1426.5, align 4
+  store i32 %77, i32* %p_d.018.5, align 4
+  store i32 %78, i32* %p_d.01823.5, align 4
+  %p_21.6 = add i32 %4, 768
+  %p_22.6 = add i32 %4, 832
+  %p_d.018.6 = getelementptr i32, i32* %dest, i32 %p_21.6
+  %p_d.01823.6 = getelementptr i32, i32* %dest, i32 %p_22.6
+  %p_24.6 = add i32 %4, 776
+  %p_25.6 = add i32 %4, 840
+  %p_arrayidx14.6 = getelementptr i32, i32* %dest, i32 %p_24.6
+  %p_arrayidx1426.6 = getelementptr i32, i32* %dest, i32 %p_25.6
+  %_p_scalar_.6 = load i32, i32* %p_d.018.6, align 4
+  %_p_vec_.6 = insertelement <2 x i32> undef, i32 %_p_scalar_.6, i32 0
+  %_p_scalar_27.6 = load i32, i32* %p_d.01823.6, align 4
+  %_p_vec_28.6 = insertelement <2 x i32> %_p_vec_.6, i32 %_p_scalar_27.6, i32 1
+  %79 = bitcast <2 x i32> %_p_vec_28.6 to i64
+  %80 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %79, i32 31)
+  %81 = bitcast i64 %80 to <2 x i32>
+  %shr9p_vec.6 = ashr <2 x i32> %_p_vec_28.6, %7
+  %xorp_vec.6 = xor <2 x i32> %81, %sub12p_vec
+  %82 = bitcast <2 x i32> %shr9p_vec.6 to i64
+  %83 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %80, i64 %82)
+  %84 = bitcast <2 x i32> %xorp_vec.6 to i64
+  %85 = tail call i64 @llvm.hexagon.C2.vmux(i32 %83, i64 %79, i64 %84)
+  %86 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %85, i32 %gb)
+  %87 = bitcast i64 %86 to <2 x i32>
+  %88 = extractelement <2 x i32> %87, i32 0
+  store i32 %88, i32* %p_arrayidx14.6, align 4
+  %89 = extractelement <2 x i32> %87, i32 1
+  store i32 %89, i32* %p_arrayidx1426.6, align 4
+  store i32 %88, i32* %p_d.018.6, align 4
+  store i32 %89, i32* %p_d.01823.6, align 4
+  %p_21.7 = add i32 %4, 896
+  %p_22.7 = add i32 %4, 960
+  %p_d.018.7 = getelementptr i32, i32* %dest, i32 %p_21.7
+  %p_d.01823.7 = getelementptr i32, i32* %dest, i32 %p_22.7
+  %p_24.7 = add i32 %4, 904
+  %p_25.7 = add i32 %4, 968
+  %p_arrayidx14.7 = getelementptr i32, i32* %dest, i32 %p_24.7
+  %p_arrayidx1426.7 = getelementptr i32, i32* %dest, i32 %p_25.7
+  %_p_scalar_.7 = load i32, i32* %p_d.018.7, align 4
+  %_p_vec_.7 = insertelement <2 x i32> undef, i32 %_p_scalar_.7, i32 0
+  %_p_scalar_27.7 = load i32, i32* %p_d.01823.7, align 4
+  %_p_vec_28.7 = insertelement <2 x i32> %_p_vec_.7, i32 %_p_scalar_27.7, i32 1
+  %90 = bitcast <2 x i32> %_p_vec_28.7 to i64
+  %91 = tail call i64 @llvm.hexagon.S2.asr.i.vw(i64 %90, i32 31)
+  %92 = bitcast i64 %91 to <2 x i32>
+  %shr9p_vec.7 = ashr <2 x i32> %_p_vec_28.7, %7
+  %xorp_vec.7 = xor <2 x i32> %92, %sub12p_vec
+  %93 = bitcast <2 x i32> %shr9p_vec.7 to i64
+  %94 = tail call i32 @llvm.hexagon.A2.vcmpweq(i64 %91, i64 %93)
+  %95 = bitcast <2 x i32> %xorp_vec.7 to i64
+  %96 = tail call i64 @llvm.hexagon.C2.vmux(i32 %94, i64 %90, i64 %95)
+  %97 = tail call i64 @llvm.hexagon.S2.asl.r.vw(i64 %96, i32 %gb)
+  %98 = bitcast i64 %97 to <2 x i32>
+  %99 = extractelement <2 x i32> %98, i32 0
+  store i32 %99, i32* %p_arrayidx14.7, align 4
+  %100 = extractelement <2 x i32> %98, i32 1
+  store i32 %100, i32* %p_arrayidx1426.7, align 4
+  store i32 %99, i32* %p_d.018.7, align 4
+  store i32 %100, i32* %p_d.01823.7, align 4
+  ret void
+}
+
+declare i64 @llvm.hexagon.S2.asr.i.vw(i64, i32) #1
+
+declare i64 @llvm.hexagon.S2.asl.i.vw(i64, i32) #1
+
+declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1
+
+declare i32 @llvm.hexagon.A2.vcmpweq(i64, i64) #1
+
+declare i64 @llvm.hexagon.C2.vmux(i32, i64, i64) #1
+
+declare i64 @llvm.hexagon.S2.asl.r.vw(i64, i32) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
--- a/test/CodeGen/Hexagon/vect/vect-vsplatb.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vsplatb.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Make sure we build the constant vector <7, 7, 7, 7> with a vsplatb.
+; CHECK: vsplatb
+@B = common global [400 x i8] zeroinitializer, align 8 ; read-only input of @run: its elements are multiplied by 7
+@A = common global [400 x i8] zeroinitializer, align 8 ; loaded, added to and stored back by @run
+@C = common global [400 x i8] zeroinitializer, align 8 ; not referenced by @run
+
+define void @run() nounwind { ; A[iv..iv+3] += 7 * B[iv..iv+3] on <4 x i8>, for iv = 0, 4, 8, ... while the next iv is < 400
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:                                 ; preds = %polly.loop_body
+  ret void
+
+polly.loop_body:                                  ; preds = %entry, %polly.loop_body
+  %polly.loopiv25 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add i32 %polly.loopiv25, 4
+  %p_arrayidx1 = getelementptr [400 x i8], [400 x i8]* @A, i32 0, i32 %polly.loopiv25
+  %p_arrayidx = getelementptr [400 x i8], [400 x i8]* @B, i32 0, i32 %polly.loopiv25
+  %vector_ptr = bitcast i8* %p_arrayidx to <4 x i8>*
+  %_p_vec_full = load <4 x i8>, <4 x i8>* %vector_ptr, align 8
+  %mulp_vec = mul <4 x i8> %_p_vec_full, <i8 7, i8 7, i8 7, i8 7> ; the all-7 constant vector is what the test expects to be built with a splat
+  %vector_ptr14 = bitcast i8* %p_arrayidx1 to <4 x i8>*
+  %_p_vec_full15 = load <4 x i8>, <4 x i8>* %vector_ptr14, align 8
+  %addp_vec = add <4 x i8> %_p_vec_full15, %mulp_vec
+  store <4 x i8> %addp_vec, <4 x i8>* %vector_ptr14, align 8
+  %0 = icmp slt i32 %polly.next_loopiv, 400
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+}
--- a/test/CodeGen/Hexagon/vect/vect-vsplath.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vsplath.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Make sure we build the constant vector <7, 7, 7, 7> with a vsplath.
+; CHECK: vsplath
+@B = common global [400 x i16] zeroinitializer, align 8 ; read-only input of @run: its elements are multiplied by 7
+@A = common global [400 x i16] zeroinitializer, align 8 ; loaded, added to and stored back by @run
+@C = common global [400 x i16] zeroinitializer, align 8 ; not referenced by @run
+
+define void @run() nounwind { ; A[iv..iv+3] += 7 * B[iv..iv+3] on <4 x i16>, for iv = 0, 4, 8, ... while the next iv is < 400
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:                                 ; preds = %polly.loop_body
+  ret void
+
+polly.loop_body:                                  ; preds = %entry, %polly.loop_body
+  %polly.loopiv26 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv26, 4
+  %p_arrayidx1 = getelementptr [400 x i16], [400 x i16]* @A, i32 0, i32 %polly.loopiv26
+  %p_arrayidx = getelementptr [400 x i16], [400 x i16]* @B, i32 0, i32 %polly.loopiv26
+  %vector_ptr = bitcast i16* %p_arrayidx to <4 x i16>*
+  %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 8
+  %mulp_vec = mul <4 x i16> %_p_vec_full, <i16 7, i16 7, i16 7, i16 7> ; the all-7 constant vector is what the test expects to be built with a splat
+  %vector_ptr15 = bitcast i16* %p_arrayidx1 to <4 x i16>*
+  %_p_vec_full16 = load <4 x i16>, <4 x i16>* %vector_ptr15, align 8
+  %addp_vec = add <4 x i16> %_p_vec_full16, %mulp_vec
+  store <4 x i16> %addp_vec, <4 x i16>* %vector_ptr15, align 8
+  %0 = icmp slt i32 %polly.next_loopiv, 400
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+}
--- a/test/CodeGen/Hexagon/vect/vect-vsubb-1.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vsubb-1.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vsubub
+
+define <4 x i8> @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind { ; returns the element-wise difference %a - %b of two <4 x i8> vectors
+entry:
+	%0 = sub <4 x i8> %a, %b
+	ret <4 x i8> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vsubb.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vsubb.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vsubub
+
+define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind { ; returns the element-wise difference %a - %b of two <8 x i8> vectors
+entry:
+	%0 = sub <8 x i8> %a, %b
+	ret <8 x i8> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vsubh-1.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vsubh-1.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vsubh
+
+define <4 x i16> @t_i4x16(<4 x i16> %a, <4 x i16> %b) nounwind { ; returns the element-wise difference %a - %b of two <4 x i16> vectors
+entry:
+	%0 = sub <4 x i16> %a, %b
+	ret <4 x i16> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vsubh.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vsubh.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vsubh
+
+define <2 x i16> @t_i2x16(<2 x i16> %a, <2 x i16> %b) nounwind { ; returns the element-wise difference %a - %b of two <2 x i16> vectors
+entry:
+	%0 = sub <2 x i16> %a, %b
+	ret <2 x i16> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-vsubw.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vsubw.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: vsubw
+
+define <2 x i32> @t_i2x32(<2 x i32> %a, <2 x i32> %b) nounwind { ; returns the element-wise difference %a - %b of two <2 x i32> vectors
+entry:
+	%0 = sub <2 x i32> %a, %b
+	ret <2 x i32> %0
+}
--- a/test/CodeGen/Hexagon/vect/vect-xor.ll
+++ b/test/CodeGen/Hexagon/vect/vect-xor.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+
+; Check that the <4 x i16> xor is generated as an xor on register pairs.
+; CHECK: r{{[0-9]+:[0-9]+}} = xor(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+@window_size = global i32 65536, align 4 ; not referenced by @fill_window; the same holds for every global here except @prev
+@prev = external global [0 x i16], align 8 ; the only global @fill_window accesses (loaded and stored as <4 x i16>)
+@block_start = common global i32 0, align 4
+@prev_length = common global i32 0, align 4
+@strstart = common global i32 0, align 4
+@match_start = common global i32 0, align 4
+@max_chain_length = common global i32 0, align 4
+@good_match = common global i32 0, align 4
+
+define void @fill_window() #0 { ; rewrites @prev four i16 at a time (iv = 0, 4, ... while the next iv is < 32768): negative lanes are xor-ed with -32768, all other lanes become 0
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:                                 ; preds = %polly.loop_body
+  ret void
+
+polly.loop_body:                                  ; preds = %entry, %polly.loop_body
+  %polly.loopiv36 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv36, 4
+  %p_arrayidx4 = getelementptr [0 x i16], [0 x i16]* @prev, i32 0, i32 %polly.loopiv36
+  %vector_ptr = bitcast i16* %p_arrayidx4 to <4 x i16>*
+  %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2
+  %cmp1p_vicmp = icmp slt <4 x i16> %_p_vec_full, zeroinitializer
+  %subp_vec = xor <4 x i16> %_p_vec_full, <i16 -32768, i16 -32768, i16 -32768, i16 -32768> ; named "sub" but implemented as an xor that flips the top bit of each lane
+  %sel1p_vsel = select <4 x i1> %cmp1p_vicmp, <4 x i16> %subp_vec, <4 x i16> zeroinitializer
+  store <4 x i16> %sel1p_vsel, <4 x i16>* %vector_ptr, align 2
+  %0 = icmp slt i32 %polly.next_loopiv, 32768
+  br i1 %0, label %polly.loop_body, label %polly.loop_after
+}
+
+attributes #0 = { nounwind "fp-contract-model"="standard" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="static" "ssp-buffers-size"="8" }
--- a/test/CodeGen/Hexagon/vect/vect-zeroextend.ll
+++ b/test/CodeGen/Hexagon/vect/vect-zeroextend.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=hexagon < %s
+; Used to fail with "Cannot select: 0x16cb2d0: v4i16 = zero_extend"
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+target triple = "hexagon-unknown-linux-gnu"
+
+define void @foo() nounwind { ; reduced reproducer: a <3 x i8> to <3 x i16> zext inside a self-looping block; the pointers and the entry branch condition are undef
+entry:
+  br i1 undef, label %for.cond30.preheader.lr.ph, label %for.end425
+
+for.cond30.preheader.lr.ph:                       ; preds = %entry
+  br label %for.cond37.preheader
+
+for.cond37.preheader:                             ; preds = %for.cond37.preheader, %for.cond30.preheader.lr.ph
+  %_p_vec_full = load <3 x i8>, <3 x i8>* undef, align 8
+  %0 = zext <3 x i8> %_p_vec_full to <3 x i16>
+  store <3 x i16> %0, <3 x i16>* undef, align 8
+  br label %for.cond37.preheader
+
+for.end425:                                       ; preds = %entry
+  ret void
+}