[PowerPC] Add loads, stores, and related things to fast-isel.

This is the next big chunk of fast-isel code. The primary purpose is to
implement selection of loads and stores, but there is a lot of drag-along
to support this. The common code to analyze addresses for both loads and
stores is substantial. It's also necessary to add materialization code
for global values.

Related to load-store processing is the code to fold loads into integer
extends, since otherwise we generate lots of redundant instructions. We
also need to override some FastEmit routines to ensure we don't assign
GPR 0 to a virtual register when this would change the meaning of an
instruction.

I also added selection of a few binary arithmetic instructions, to enable
committing some test cases I wrote a while back.

Finally, a couple of miscellaneous changes:
 * I cleaned up some poor style from a previous patch in
   PPCISelLowering.cpp, pointed out by David Blaikie.
 * I enlarged the Addr.Offset field to avoid sign problems with 32-bit
   offsets.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189636 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent 6dc6a89d73, commit 7248968fa5
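To make the new load-folding behavior concrete, here is a minimal, illustrative test in the style of the fast-isel-fold.ll file added below; the global @g8 and the function name @zext_byte are placeholders, not part of the patch. With this change, fast-isel folds the byte load into the zero-extension, so only an lbz is emitted and no separate rldicl is needed to clear the upper bits.

; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s

@g8 = global i8 1, align 1

; Illustrative only: names @g8 and @zext_byte are placeholders.
define i64 @zext_byte() nounwind {
; CHECK: zext_byte
; CHECK: lbz
; CHECK-NOT: rldicl
  %1 = load i8* @g8, align 1
  %2 = zext i8 %1 to i64
  ret i64 %2
}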
lib/Target/PowerPC/PPCFastISel.cpp

@@ -52,7 +52,7 @@ typedef struct Address {
     int FI;
   } Base;
 
-  int Offset;
+  long Offset;
 
   // Innocuous defaults for our address.
   Address()
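For orientation while reading the rest of the patch, the Address helper that this hunk touches looks roughly as follows. This is an editorial sketch reconstructed from the hunk above and from how Base.Reg, Base.FI, and Offset are used later in the diff; it is not a verbatim quote of the file.

// Sketch of the fast-isel Address helper after this patch (reconstructed,
// not quoted verbatim; the constructor body in particular is inferred).
typedef struct Address {
  enum {
    RegBase,
    FrameIndexBase
  } BaseType;

  union {
    unsigned Reg;              // base register when BaseType == RegBase
    int FI;                    // frame index when BaseType == FrameIndexBase
  } Base;

  long Offset;                 // widened from int by this patch

  // Innocuous defaults for our address.
  Address()
    : BaseType(RegBase), Offset(0) {
    Base.Reg = 0;
  }
} Address;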
@@ -90,21 +90,45 @@ class PPCFastISel : public FastISel {
                                     const LoadInst *LI);
    virtual bool FastLowerArguments();
    virtual unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm);
    virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
                                     const TargetRegisterClass *RC,
                                     unsigned Op0, bool Op0IsKill,
                                     uint64_t Imm);
    virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode,
                                    const TargetRegisterClass *RC,
                                    unsigned Op0, bool Op0IsKill);
    virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
                                     const TargetRegisterClass *RC,
                                     unsigned Op0, bool Op0IsKill,
                                     unsigned Op1, bool Op1IsKill);

  // Instruction selection routines.
  private:
    bool SelectLoad(const Instruction *I);
    bool SelectStore(const Instruction *I);
    bool SelectBranch(const Instruction *I);
    bool SelectIndirectBr(const Instruction *I);
    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
    bool SelectRet(const Instruction *I);
    bool SelectIntExt(const Instruction *I);

  // Utility routines.
  private:
    bool isTypeLegal(Type *Ty, MVT &VT);
    bool isLoadTypeLegal(Type *Ty, MVT &VT);
    bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
                    bool isZExt, unsigned DestReg);
    bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
                     const TargetRegisterClass *RC, bool IsZExt = true,
                     unsigned FP64LoadOpc = PPC::LFD);
    bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
    bool PPCComputeAddress(const Value *Obj, Address &Addr);
    void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
                            unsigned &IndexReg);
    bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
                       unsigned DestReg, bool IsZExt);
    unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
    unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
    unsigned PPCMaterializeInt(const Constant *C, MVT VT);
    unsigned PPCMaterialize32BitInt(int64_t Imm,
                                    const TargetRegisterClass *RC);
@@ -187,6 +211,439 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
  }
}

// Determine whether the type Ty is simple enough to be handled by
// fast-isel, and return its equivalent machine type in VT.
// FIXME: Copied directly from ARM -- factor into base class?
bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
  EVT Evt = TLI.getValueType(Ty, true);

  // Only handle simple types.
  if (Evt == MVT::Other || !Evt.isSimple()) return false;
  VT = Evt.getSimpleVT();

  // Handle all legal types, i.e. a register that will directly hold this
  // value.
  return TLI.isTypeLegal(VT);
}

// Determine whether the type Ty is simple enough to be handled by
// fast-isel as a load target, and return its equivalent machine type in VT.
bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
  if (isTypeLegal(Ty, VT)) return true;

  // If this is a type that can be sign- or zero-extended to a basic operation,
  // go ahead and accept it now.
  if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
    return true;
  }

  return false;
}
|
||||
|
||||
// Given a value Obj, create an Address object Addr that represents its
|
||||
// address. Return false if we can't handle it.
|
||||
bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
|
||||
const User *U = NULL;
|
||||
unsigned Opcode = Instruction::UserOp1;
|
||||
if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
|
||||
// Don't walk into other basic blocks unless the object is an alloca from
|
||||
// another block, otherwise it may not have a virtual register assigned.
|
||||
if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
|
||||
FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
|
||||
Opcode = I->getOpcode();
|
||||
U = I;
|
||||
}
|
||||
} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
|
||||
Opcode = C->getOpcode();
|
||||
U = C;
|
||||
}
|
||||
|
||||
switch (Opcode) {
|
||||
default:
|
||||
break;
|
||||
case Instruction::BitCast:
|
||||
// Look through bitcasts.
|
||||
return PPCComputeAddress(U->getOperand(0), Addr);
|
||||
case Instruction::IntToPtr:
|
||||
// Look past no-op inttoptrs.
|
||||
if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
|
||||
return PPCComputeAddress(U->getOperand(0), Addr);
|
||||
break;
|
||||
case Instruction::PtrToInt:
|
||||
// Look past no-op ptrtoints.
|
||||
if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
|
||||
return PPCComputeAddress(U->getOperand(0), Addr);
|
||||
break;
|
||||
case Instruction::GetElementPtr: {
|
||||
Address SavedAddr = Addr;
|
||||
long TmpOffset = Addr.Offset;
|
||||
|
||||
// Iterate through the GEP folding the constants into offsets where
|
||||
// we can.
|
||||
gep_type_iterator GTI = gep_type_begin(U);
|
||||
for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
|
||||
II != IE; ++II, ++GTI) {
|
||||
const Value *Op = *II;
|
||||
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
|
||||
const StructLayout *SL = TD.getStructLayout(STy);
|
||||
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
|
||||
TmpOffset += SL->getElementOffset(Idx);
|
||||
} else {
|
||||
uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
|
||||
for (;;) {
|
||||
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
|
||||
// Constant-offset addressing.
|
||||
TmpOffset += CI->getSExtValue() * S;
|
||||
break;
|
||||
}
|
||||
if (isa<AddOperator>(Op) &&
|
||||
(!isa<Instruction>(Op) ||
|
||||
FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
|
||||
== FuncInfo.MBB) &&
|
||||
isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
|
||||
// An add (in the same block) with a constant operand. Fold the
|
||||
// constant.
|
||||
ConstantInt *CI =
|
||||
cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
|
||||
TmpOffset += CI->getSExtValue() * S;
|
||||
// Iterate on the other operand.
|
||||
Op = cast<AddOperator>(Op)->getOperand(0);
|
||||
continue;
|
||||
}
|
||||
// Unsupported
|
||||
goto unsupported_gep;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try to grab the base operand now.
|
||||
Addr.Offset = TmpOffset;
|
||||
if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
|
||||
|
||||
// We failed, restore everything and try the other options.
|
||||
Addr = SavedAddr;
|
||||
|
||||
unsupported_gep:
|
||||
break;
|
||||
}
|
||||
case Instruction::Alloca: {
|
||||
const AllocaInst *AI = cast<AllocaInst>(Obj);
|
||||
DenseMap<const AllocaInst*, int>::iterator SI =
|
||||
FuncInfo.StaticAllocaMap.find(AI);
|
||||
if (SI != FuncInfo.StaticAllocaMap.end()) {
|
||||
Addr.BaseType = Address::FrameIndexBase;
|
||||
Addr.Base.FI = SI->second;
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: References to parameters fall through to the behavior
|
||||
// below. They should be able to reference a frame index since
|
||||
// they are stored to the stack, so we can get "ld rx, offset(r1)"
|
||||
// instead of "addi ry, r1, offset / ld rx, 0(ry)". Obj will
|
||||
// just contain the parameter. Try to handle this with a FI.
|
||||
|
||||
// Try to get this in a register if nothing else has worked.
|
||||
if (Addr.Base.Reg == 0)
|
||||
Addr.Base.Reg = getRegForValue(Obj);
|
||||
|
||||
// Prevent assignment of base register to X0, which is inappropriate
|
||||
// for loads and stores alike.
|
||||
if (Addr.Base.Reg != 0)
|
||||
MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
|
||||
|
||||
return Addr.Base.Reg != 0;
|
||||
}
|
||||
|
||||
// Fix up some addresses that can't be used directly. For example, if
|
||||
// an offset won't fit in an instruction field, we may need to move it
|
||||
// into an index register.
|
||||
void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
|
||||
unsigned &IndexReg) {
|
||||
|
||||
// Check whether the offset fits in the instruction field.
|
||||
if (!isInt<16>(Addr.Offset))
|
||||
UseOffset = false;
|
||||
|
||||
// If this is a stack pointer and the offset needs to be simplified then
|
||||
// put the alloca address into a register, set the base type back to
|
||||
// register and continue. This should almost never happen.
|
||||
if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
|
||||
unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
|
||||
ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
|
||||
Addr.Base.Reg = ResultReg;
|
||||
Addr.BaseType = Address::RegBase;
|
||||
}
|
||||
|
||||
if (!UseOffset) {
|
||||
IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context)
|
||||
: Type::getInt64Ty(*Context));
|
||||
const ConstantInt *Offset =
|
||||
ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
|
||||
IndexReg = PPCMaterializeInt(Offset, MVT::i64);
|
||||
assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
|
||||
}
|
||||
}
|
||||
|
||||
// Emit a load instruction if possible, returning true if we succeeded,
|
||||
// otherwise false. See commentary below for how the register class of
|
||||
// the load is determined.
|
||||
bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
|
||||
const TargetRegisterClass *RC,
|
||||
bool IsZExt, unsigned FP64LoadOpc) {
|
||||
unsigned Opc;
|
||||
bool UseOffset = true;
|
||||
|
||||
// If ResultReg is given, it determines the register class of the load.
|
||||
// Otherwise, RC is the register class to use. If the result of the
|
||||
// load isn't anticipated in this block, both may be zero, in which
|
||||
// case we must make a conservative guess. In particular, don't assign
|
||||
// R0 or X0 to the result register, as the result may be used in a load,
|
||||
// store, add-immediate, or isel that won't permit this. (Though
|
||||
// perhaps the spill and reload of live-exit values would handle this?)
|
||||
const TargetRegisterClass *UseRC =
|
||||
(ResultReg ? MRI.getRegClass(ResultReg) :
|
||||
(RC ? RC :
|
||||
(VT == MVT::f64 ? &PPC::F8RCRegClass :
|
||||
(VT == MVT::f32 ? &PPC::F4RCRegClass :
|
||||
(VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
|
||||
&PPC::GPRC_and_GPRC_NOR0RegClass)))));
|
||||
|
||||
bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
|
||||
|
||||
switch (VT.SimpleTy) {
|
||||
default: // e.g., vector types not handled
|
||||
return false;
|
||||
case MVT::i8:
|
||||
Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
|
||||
break;
|
||||
case MVT::i16:
|
||||
Opc = (IsZExt ?
|
||||
(Is32BitInt ? PPC::LHZ : PPC::LHZ8) :
|
||||
(Is32BitInt ? PPC::LHA : PPC::LHA8));
|
||||
break;
|
||||
case MVT::i32:
|
||||
Opc = (IsZExt ?
|
||||
(Is32BitInt ? PPC::LWZ : PPC::LWZ8) :
|
||||
(Is32BitInt ? PPC::LWA_32 : PPC::LWA));
|
||||
if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
|
||||
UseOffset = false;
|
||||
break;
|
||||
case MVT::i64:
|
||||
Opc = PPC::LD;
|
||||
assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) &&
|
||||
"64-bit load with 32-bit target??");
|
||||
UseOffset = ((Addr.Offset & 3) == 0);
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opc = PPC::LFS;
|
||||
break;
|
||||
case MVT::f64:
|
||||
Opc = FP64LoadOpc;
|
||||
break;
|
||||
}
|
||||
|
||||
// If necessary, materialize the offset into a register and use
|
||||
// the indexed form. Also handle stack pointers with special needs.
|
||||
unsigned IndexReg = 0;
|
||||
PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
|
||||
if (ResultReg == 0)
|
||||
ResultReg = createResultReg(UseRC);
|
||||
|
||||
// Note: If we still have a frame index here, we know the offset is
|
||||
// in range, as otherwise PPCSimplifyAddress would have converted it
|
||||
// into a RegBase.
|
||||
if (Addr.BaseType == Address::FrameIndexBase) {
|
||||
|
||||
MachineMemOperand *MMO =
|
||||
FuncInfo.MF->getMachineMemOperand(
|
||||
MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
|
||||
MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
|
||||
MFI.getObjectAlignment(Addr.Base.FI));
|
||||
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
|
||||
.addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
|
||||
|
||||
// Base reg with offset in range.
|
||||
} else if (UseOffset) {
|
||||
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
|
||||
.addImm(Addr.Offset).addReg(Addr.Base.Reg);
|
||||
|
||||
// Indexed form.
|
||||
} else {
|
||||
// Get the RR opcode corresponding to the RI one. FIXME: It would be
|
||||
// preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
|
||||
// is hard to get at.
|
||||
switch (Opc) {
|
||||
default: llvm_unreachable("Unexpected opcode!");
|
||||
case PPC::LBZ: Opc = PPC::LBZX; break;
|
||||
case PPC::LBZ8: Opc = PPC::LBZX8; break;
|
||||
case PPC::LHZ: Opc = PPC::LHZX; break;
|
||||
case PPC::LHZ8: Opc = PPC::LHZX8; break;
|
||||
case PPC::LHA: Opc = PPC::LHAX; break;
|
||||
case PPC::LHA8: Opc = PPC::LHAX8; break;
|
||||
case PPC::LWZ: Opc = PPC::LWZX; break;
|
||||
case PPC::LWZ8: Opc = PPC::LWZX8; break;
|
||||
case PPC::LWA: Opc = PPC::LWAX; break;
|
||||
case PPC::LWA_32: Opc = PPC::LWAX_32; break;
|
||||
case PPC::LD: Opc = PPC::LDX; break;
|
||||
case PPC::LFS: Opc = PPC::LFSX; break;
|
||||
case PPC::LFD: Opc = PPC::LFDX; break;
|
||||
}
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
|
||||
.addReg(Addr.Base.Reg).addReg(IndexReg);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Attempt to fast-select a load instruction.
|
||||
bool PPCFastISel::SelectLoad(const Instruction *I) {
|
||||
// FIXME: No atomic loads are supported.
|
||||
if (cast<LoadInst>(I)->isAtomic())
|
||||
return false;
|
||||
|
||||
// Verify we have a legal type before going any further.
|
||||
MVT VT;
|
||||
if (!isLoadTypeLegal(I->getType(), VT))
|
||||
return false;
|
||||
|
||||
// See if we can handle this address.
|
||||
Address Addr;
|
||||
if (!PPCComputeAddress(I->getOperand(0), Addr))
|
||||
return false;
|
||||
|
||||
// Look at the currently assigned register for this instruction
|
||||
// to determine the required register class. This is necessary
|
||||
// to constrain RA from using R0/X0 when this is not legal.
|
||||
unsigned AssignedReg = FuncInfo.ValueMap[I];
|
||||
const TargetRegisterClass *RC =
|
||||
AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
|
||||
|
||||
unsigned ResultReg = 0;
|
||||
if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
|
||||
return false;
|
||||
UpdateValueMap(I, ResultReg);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Emit a store instruction to store SrcReg at Addr.
|
||||
bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
|
||||
assert(SrcReg && "Nothing to store!");
|
||||
unsigned Opc;
|
||||
bool UseOffset = true;
|
||||
|
||||
const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
|
||||
bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
|
||||
|
||||
switch (VT.SimpleTy) {
|
||||
default: // e.g., vector types not handled
|
||||
return false;
|
||||
case MVT::i8:
|
||||
Opc = Is32BitInt ? PPC::STB : PPC::STB8;
|
||||
break;
|
||||
case MVT::i16:
|
||||
Opc = Is32BitInt ? PPC::STH : PPC::STH8;
|
||||
break;
|
||||
case MVT::i32:
|
||||
assert(Is32BitInt && "Not GPRC for i32??");
|
||||
Opc = PPC::STW;
|
||||
break;
|
||||
case MVT::i64:
|
||||
Opc = PPC::STD;
|
||||
UseOffset = ((Addr.Offset & 3) == 0);
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opc = PPC::STFS;
|
||||
break;
|
||||
case MVT::f64:
|
||||
Opc = PPC::STFD;
|
||||
break;
|
||||
}
|
||||
|
||||
// If necessary, materialize the offset into a register and use
|
||||
// the indexed form. Also handle stack pointers with special needs.
|
||||
unsigned IndexReg = 0;
|
||||
PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
|
||||
|
||||
// Note: If we still have a frame index here, we know the offset is
|
||||
// in range, as otherwise PPCSimplifyAddress would have converted it
|
||||
// into a RegBase.
|
||||
if (Addr.BaseType == Address::FrameIndexBase) {
|
||||
MachineMemOperand *MMO =
|
||||
FuncInfo.MF->getMachineMemOperand(
|
||||
MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
|
||||
MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
|
||||
MFI.getObjectAlignment(Addr.Base.FI));
|
||||
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)).addReg(SrcReg)
|
||||
.addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
|
||||
|
||||
// Base reg with offset in range.
|
||||
} else if (UseOffset) {
|
||||
if (Addr.Offset == 0 && Opc == PPC::STW8)
|
||||
dbgs() << "Possible problem here.\n";
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
|
||||
.addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
|
||||
|
||||
// Indexed form.
|
||||
} else {
|
||||
// Get the RR opcode corresponding to the RI one. FIXME: It would be
|
||||
// preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
|
||||
// is hard to get at.
|
||||
switch (Opc) {
|
||||
default: llvm_unreachable("Unexpected opcode!");
|
||||
case PPC::STB: Opc = PPC::STBX; break;
|
||||
case PPC::STH : Opc = PPC::STHX; break;
|
||||
case PPC::STW : Opc = PPC::STWX; break;
|
||||
case PPC::STB8: Opc = PPC::STBX8; break;
|
||||
case PPC::STH8: Opc = PPC::STHX8; break;
|
||||
case PPC::STW8: Opc = PPC::STWX8; break;
|
||||
case PPC::STD: Opc = PPC::STDX; break;
|
||||
case PPC::STFS: Opc = PPC::STFSX; break;
|
||||
case PPC::STFD: Opc = PPC::STFDX; break;
|
||||
}
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
|
||||
.addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Attempt to fast-select a store instruction.
bool PPCFastISel::SelectStore(const Instruction *I) {
  Value *Op0 = I->getOperand(0);
  unsigned SrcReg = 0;

  // FIXME: No atomic stores are supported.
  if (cast<StoreInst>(I)->isAtomic())
    return false;
|
||||
|
||||
// Verify we have a legal type before going any further.
|
||||
MVT VT;
|
||||
if (!isLoadTypeLegal(Op0->getType(), VT))
|
||||
return false;
|
||||
|
||||
// Get the value to be stored into a register.
|
||||
SrcReg = getRegForValue(Op0);
|
||||
if (SrcReg == 0)
|
||||
return false;
|
||||
|
||||
// See if we can handle this address.
|
||||
Address Addr;
|
||||
if (!PPCComputeAddress(I->getOperand(1), Addr))
|
||||
return false;
|
||||
|
||||
if (!PPCEmitStore(VT, SrcReg, Addr))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Attempt to fast-select a branch instruction.
|
||||
bool PPCFastISel::SelectBranch(const Instruction *I) {
|
||||
const BranchInst *BI = cast<BranchInst>(I);
|
||||
@@ -330,6 +787,109 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Attempt to fast-select a binary integer operation that isn't already
|
||||
// handled automatically.
|
||||
bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
|
||||
EVT DestVT = TLI.getValueType(I->getType(), true);
|
||||
|
||||
// We can get here in the case when we have a binary operation on a non-legal
|
||||
// type and the target independent selector doesn't know how to handle it.
|
||||
if (DestVT != MVT::i16 && DestVT != MVT::i8)
|
||||
return false;
|
||||
|
||||
// Look at the currently assigned register for this instruction
|
||||
// to determine the required register class. If there is no register,
|
||||
// make a conservative choice (don't assign R0).
|
||||
unsigned AssignedReg = FuncInfo.ValueMap[I];
|
||||
const TargetRegisterClass *RC =
|
||||
(AssignedReg ? MRI.getRegClass(AssignedReg) :
|
||||
&PPC::GPRC_and_GPRC_NOR0RegClass);
|
||||
bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
|
||||
|
||||
unsigned Opc;
|
||||
switch (ISDOpcode) {
|
||||
default: return false;
|
||||
case ISD::ADD:
|
||||
Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8;
|
||||
break;
|
||||
case ISD::OR:
|
||||
Opc = IsGPRC ? PPC::OR : PPC::OR8;
|
||||
break;
|
||||
case ISD::SUB:
|
||||
Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8;
|
||||
break;
|
||||
}
|
||||
|
||||
unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
|
||||
unsigned SrcReg1 = getRegForValue(I->getOperand(0));
|
||||
if (SrcReg1 == 0) return false;
|
||||
|
||||
// Handle case of small immediate operand.
|
||||
if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) {
|
||||
const APInt &CIVal = ConstInt->getValue();
|
||||
int Imm = (int)CIVal.getSExtValue();
|
||||
bool UseImm = true;
|
||||
if (isInt<16>(Imm)) {
|
||||
switch (Opc) {
|
||||
default:
|
||||
llvm_unreachable("Missing case!");
|
||||
case PPC::ADD4:
|
||||
Opc = PPC::ADDI;
|
||||
MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
|
||||
break;
|
||||
case PPC::ADD8:
|
||||
Opc = PPC::ADDI8;
|
||||
MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
|
||||
break;
|
||||
case PPC::OR:
|
||||
Opc = PPC::ORI;
|
||||
break;
|
||||
case PPC::OR8:
|
||||
Opc = PPC::ORI8;
|
||||
break;
|
||||
case PPC::SUBF:
|
||||
if (Imm == -32768)
|
||||
UseImm = false;
|
||||
else {
|
||||
Opc = PPC::ADDI;
|
||||
MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
|
||||
Imm = -Imm;
|
||||
}
|
||||
break;
|
||||
case PPC::SUBF8:
|
||||
if (Imm == -32768)
|
||||
UseImm = false;
|
||||
else {
|
||||
Opc = PPC::ADDI8;
|
||||
MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
|
||||
Imm = -Imm;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (UseImm) {
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
|
||||
.addReg(SrcReg1).addImm(Imm);
|
||||
UpdateValueMap(I, ResultReg);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reg-reg case.
|
||||
unsigned SrcReg2 = getRegForValue(I->getOperand(1));
|
||||
if (SrcReg2 == 0) return false;
|
||||
|
||||
// Reverse operands for subtract-from.
|
||||
if (ISDOpcode == ISD::SUB)
|
||||
std::swap(SrcReg1, SrcReg2);
|
||||
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
|
||||
.addReg(SrcReg1).addReg(SrcReg2);
|
||||
UpdateValueMap(I, ResultReg);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Attempt to fast-select a return instruction.
|
||||
bool PPCFastISel::SelectRet(const Instruction *I) {
|
||||
|
||||
@@ -551,10 +1111,20 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) {
|
||||
bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
|
||||
|
||||
switch (I->getOpcode()) {
|
||||
case Instruction::Load:
|
||||
return SelectLoad(I);
|
||||
case Instruction::Store:
|
||||
return SelectStore(I);
|
||||
case Instruction::Br:
|
||||
return SelectBranch(I);
|
||||
case Instruction::IndirectBr:
|
||||
return SelectIndirectBr(I);
|
||||
case Instruction::Add:
|
||||
return SelectBinaryIntOp(I, ISD::ADD);
|
||||
case Instruction::Or:
|
||||
return SelectBinaryIntOp(I, ISD::OR);
|
||||
case Instruction::Sub:
|
||||
return SelectBinaryIntOp(I, ISD::SUB);
|
||||
case Instruction::Ret:
|
||||
return SelectRet(I);
|
||||
case Instruction::ZExt:
|
||||
@@ -611,6 +1181,68 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
|
||||
return DestReg;
|
||||
}
|
||||
|
||||
// Materialize the address of a global value into a register, and return
|
||||
// the register number (or zero if we failed to handle it).
|
||||
unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
|
||||
assert(VT == MVT::i64 && "Non-address!");
|
||||
const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
|
||||
unsigned DestReg = createResultReg(RC);
|
||||
|
||||
// Global values may be plain old object addresses, TLS object
|
||||
// addresses, constant pool entries, or jump tables. How we generate
|
||||
// code for these may depend on small, medium, or large code model.
|
||||
CodeModel::Model CModel = TM.getCodeModel();
|
||||
|
||||
// FIXME: Jump tables are not yet required because fast-isel doesn't
|
||||
// handle switches; if that changes, we need them as well. For now,
|
||||
// what follows assumes everything's a generic (or TLS) global address.
|
||||
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
|
||||
if (!GVar) {
|
||||
// If GV is an alias, use the aliasee for determining thread-locality.
|
||||
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
|
||||
GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
|
||||
assert((GVar || isa<Function>(GV)) && "Unexpected GV subclass!");
|
||||
}
|
||||
|
||||
// FIXME: We don't yet handle the complexity of TLS.
|
||||
bool IsTLS = GVar && GVar->isThreadLocal();
|
||||
if (IsTLS)
|
||||
return 0;
|
||||
|
||||
// For small code model, generate a simple TOC load.
|
||||
if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtoc), DestReg)
|
||||
.addGlobalAddress(GV).addReg(PPC::X2);
|
||||
else {
|
||||
// If the address is an externally defined symbol, a symbol with
|
||||
// common or externally available linkage, a function address, or a
|
||||
// jump table address (not yet needed), or if we are generating code
|
||||
// for large code model, we generate:
|
||||
// LDtocL(GV, ADDIStocHA(%X2, GV))
|
||||
// Otherwise we generate:
|
||||
// ADDItocL(ADDIStocHA(%X2, GV), GV)
|
||||
// Either way, start with the ADDIStocHA:
|
||||
unsigned HighPartReg = createResultReg(RC);
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
|
||||
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
|
||||
|
||||
// !GVar implies a function address. An external variable is one
|
||||
// without an initializer.
|
||||
// If/when switches are implemented, jump tables should be handled
|
||||
// on the "if" path here.
|
||||
if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() ||
|
||||
GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage())
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
|
||||
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
|
||||
else
|
||||
// Otherwise generate the ADDItocL.
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDItocL),
|
||||
DestReg).addReg(HighPartReg).addGlobalAddress(GV);
|
||||
}
|
||||
|
||||
return DestReg;
|
||||
}
|
||||
|
||||
// Materialize a 32-bit integer constant into a register, and return
|
||||
// the register number (or zero if we failed to handle it).
|
||||
unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
|
||||
@@ -743,6 +1375,8 @@ unsigned PPCFastISel::TargetMaterializeConstant(const Constant *C) {
|
||||
|
||||
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
|
||||
return PPCMaterializeFP(CFP, VT);
|
||||
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
|
||||
return PPCMaterializeGV(GV, VT);
|
||||
else if (isa<ConstantInt>(C))
|
||||
return PPCMaterializeInt(C, VT);
|
||||
// TBD: Global values.
|
||||
@@ -756,10 +1390,82 @@ unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
   return AI && 0;
 }
 
-// Fold loads into extends when possible. TBD.
+// Fold loads into extends when possible.
+// FIXME: We can have multiple redundant extend/trunc instructions
+// following a load. The folding only picks up one. Extend this
+// to check subsequent instructions for the same pattern and remove
+// them. Thus ResultReg should be the def reg for the last redundant
+// instruction in a chain, and all intervening instructions can be
+// removed from parent. Change test/CodeGen/PowerPC/fast-isel-fold.ll
+// to add ELF64-NOT: rldicl to the appropriate tests when this works.
 bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                                       const LoadInst *LI) {
-  return MI && OpNo && LI && false;
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
|
||||
|
||||
// Combine load followed by zero- or sign-extend.
|
||||
bool IsZExt = false;
|
||||
switch(MI->getOpcode()) {
|
||||
default:
|
||||
return false;
|
||||
|
||||
case PPC::RLDICL:
|
||||
case PPC::RLDICL_32_64: {
|
||||
IsZExt = true;
|
||||
unsigned MB = MI->getOperand(3).getImm();
|
||||
if ((VT == MVT::i8 && MB <= 56) ||
|
||||
(VT == MVT::i16 && MB <= 48) ||
|
||||
(VT == MVT::i32 && MB <= 32))
|
||||
break;
|
||||
return false;
|
||||
}
|
||||
|
||||
case PPC::RLWINM:
|
||||
case PPC::RLWINM8: {
|
||||
IsZExt = true;
|
||||
unsigned MB = MI->getOperand(3).getImm();
|
||||
if ((VT == MVT::i8 && MB <= 24) ||
|
||||
(VT == MVT::i16 && MB <= 16))
|
||||
break;
|
||||
return false;
|
||||
}
|
||||
|
||||
case PPC::EXTSB:
|
||||
case PPC::EXTSB8:
|
||||
case PPC::EXTSB8_32_64:
|
||||
/* There is no sign-extending load-byte instruction. */
|
||||
return false;
|
||||
|
||||
case PPC::EXTSH:
|
||||
case PPC::EXTSH8:
|
||||
case PPC::EXTSH8_32_64: {
|
||||
if (VT != MVT::i16 && VT != MVT::i8)
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
|
||||
case PPC::EXTSW:
|
||||
case PPC::EXTSW_32_64: {
|
||||
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// See if we can handle this address.
|
||||
Address Addr;
|
||||
if (!PPCComputeAddress(LI->getOperand(0), Addr))
|
||||
return false;
|
||||
|
||||
unsigned ResultReg = MI->getOperand(0).getReg();
|
||||
|
||||
if (!PPCEmitLoad(VT, ResultReg, Addr, 0, IsZExt))
|
||||
return false;
|
||||
|
||||
MI->eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Attempt to lower call arguments in a faster way than done by
|
||||
@@ -791,6 +1497,62 @@ unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
|
||||
return PPCMaterialize32BitInt(Imm, RC);
|
||||
}
|
||||
|
||||
// Override for ADDI and ADDI8 to set the correct register class
|
||||
// on RHS operand 0. The automatic infrastructure naively assumes
|
||||
// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost
|
||||
// for these cases. At the moment, none of the other automatically
|
||||
// generated RI instructions require special treatment. However, once
|
||||
// SelectSelect is implemented, "isel" requires similar handling.
|
||||
//
|
||||
// Also be conservative about the output register class. Avoid
|
||||
// assigning R0 or X0 to the output register for GPRC and G8RC
|
||||
// register classes, as any such result could be used in ADDI, etc.,
|
||||
// where those regs have another meaning.
|
||||
unsigned PPCFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
|
||||
const TargetRegisterClass *RC,
|
||||
unsigned Op0, bool Op0IsKill,
|
||||
uint64_t Imm) {
|
||||
if (MachineInstOpcode == PPC::ADDI)
|
||||
MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
|
||||
else if (MachineInstOpcode == PPC::ADDI8)
|
||||
MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass);
|
||||
|
||||
const TargetRegisterClass *UseRC =
|
||||
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
|
||||
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
|
||||
|
||||
return FastISel::FastEmitInst_ri(MachineInstOpcode, UseRC,
|
||||
Op0, Op0IsKill, Imm);
|
||||
}
|
||||
|
||||
// Override for instructions with one register operand to avoid use of
|
||||
// R0/X0. The automatic infrastructure isn't aware of the context so
|
||||
// we must be conservative.
|
||||
unsigned PPCFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
|
||||
const TargetRegisterClass* RC,
|
||||
unsigned Op0, bool Op0IsKill) {
|
||||
const TargetRegisterClass *UseRC =
|
||||
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
|
||||
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
|
||||
|
||||
return FastISel::FastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
|
||||
}
|
||||
|
||||
// Override for instructions with two register operands to avoid use
|
||||
// of R0/X0. The automatic infrastructure isn't aware of the context
|
||||
// so we must be conservative.
|
||||
unsigned PPCFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
|
||||
const TargetRegisterClass* RC,
|
||||
unsigned Op0, bool Op0IsKill,
|
||||
unsigned Op1, bool Op1IsKill) {
|
||||
const TargetRegisterClass *UseRC =
|
||||
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
|
||||
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
|
||||
|
||||
return FastISel::FastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
|
||||
Op1, Op1IsKill);
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
// Create the fast instruction selector for PowerPC64 ELF.
|
||||
FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
|
||||
|
lib/Target/PowerPC/PPCISelLowering.cpp

@@ -1813,10 +1813,8 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
 // Function whose sole purpose is to kill compiler warnings
 // stemming from unused functions included from PPCGenCallingConv.inc.
 CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
-  if (Flag == 1)
-    return RetCC_PPC64_ELF_FIS; /* CC_PPC64_ELF_FIS in future patch. */
-  else
-    return RetCC_PPC64_ELF_FIS;
+  /* One of these will be CC_PPC64_ELF_FIS in a future patch. */
+  return Flag ? RetCC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
 }
 
 bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
lib/Target/PowerPC/PPCInstr64Bit.td

@@ -636,6 +636,15 @@ def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
                    "lwax $rD, $src", LdStLHA,
                    [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
                    PPC970_DGroup_Cracked;
// For fast-isel:
let isCodeGenOnly = 1, mayLoad = 1 in {
def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
                      "lwa $rD, $src", LdStLWA, []>, isPPC64,
                      PPC970_DGroup_Cracked;
def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
                      "lwax $rD, $src", LdStLHA, []>, isPPC64,
                      PPC970_DGroup_Cracked;
} // end fast-isel isCodeGenOnly

// Update forms.
let mayLoad = 1, neverHasSideEffects = 1 in {
test/CodeGen/PowerPC/fast-isel-binary.ll (new file, 137 lines)

@@ -0,0 +1,137 @@
|
||||
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
|
||||
|
||||
; Test add with non-legal types
|
||||
|
||||
define void @add_i8(i8 %a, i8 %b) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: add_i8
|
||||
%a.addr = alloca i8, align 4
|
||||
%0 = add i8 %a, %b
|
||||
; ELF64: add
|
||||
store i8 %0, i8* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_i8_imm(i8 %a) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: add_i8_imm
|
||||
%a.addr = alloca i8, align 4
|
||||
%0 = add i8 %a, 22;
|
||||
; ELF64: addi
|
||||
store i8 %0, i8* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_i16(i16 %a, i16 %b) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: add_i16
|
||||
%a.addr = alloca i16, align 4
|
||||
%0 = add i16 %a, %b
|
||||
; ELF64: add
|
||||
store i16 %0, i16* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_i16_imm(i16 %a, i16 %b) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: add_i16_imm
|
||||
%a.addr = alloca i16, align 4
|
||||
%0 = add i16 %a, 243;
|
||||
; ELF64: addi
|
||||
store i16 %0, i16* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test or with non-legal types
|
||||
|
||||
define void @or_i8(i8 %a, i8 %b) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: or_i8
|
||||
%a.addr = alloca i8, align 4
|
||||
%0 = or i8 %a, %b
|
||||
; ELF64: or
|
||||
store i8 %0, i8* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @or_i8_imm(i8 %a) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: or_i8_imm
|
||||
%a.addr = alloca i8, align 4
|
||||
%0 = or i8 %a, -13;
|
||||
; ELF64: ori
|
||||
store i8 %0, i8* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @or_i16(i16 %a, i16 %b) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: or_i16
|
||||
%a.addr = alloca i16, align 4
|
||||
%0 = or i16 %a, %b
|
||||
; ELF64: or
|
||||
store i16 %0, i16* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @or_i16_imm(i16 %a) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: or_i16_imm
|
||||
%a.addr = alloca i16, align 4
|
||||
%0 = or i16 %a, 273;
|
||||
; ELF64: ori
|
||||
store i16 %0, i16* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test sub with non-legal types
|
||||
|
||||
define void @sub_i8(i8 %a, i8 %b) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: sub_i8
|
||||
%a.addr = alloca i8, align 4
|
||||
%0 = sub i8 %a, %b
|
||||
; ELF64: subf
|
||||
store i8 %0, i8* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sub_i8_imm(i8 %a) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: sub_i8_imm
|
||||
%a.addr = alloca i8, align 4
|
||||
%0 = sub i8 %a, 22;
|
||||
; ELF64: addi
|
||||
store i8 %0, i8* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sub_i16(i16 %a, i16 %b) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: sub_i16
|
||||
%a.addr = alloca i16, align 4
|
||||
%0 = sub i16 %a, %b
|
||||
; ELF64: subf
|
||||
store i16 %0, i16* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sub_i16_imm(i16 %a) nounwind ssp {
|
||||
entry:
|
||||
; ELF64: sub_i16_imm
|
||||
%a.addr = alloca i16, align 4
|
||||
%0 = sub i16 %a, 247;
|
||||
; ELF64: addi
|
||||
store i16 %0, i16* %a.addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @sub_i16_badimm(i16 %a) nounwind ssp {
entry:
; ELF64: sub_i16_badimm
|
||||
%a.addr = alloca i16, align 4
|
||||
%0 = sub i16 %a, -32768;
|
||||
; ELF64: subf
|
||||
store i16 %0, i16* %a.addr, align 4
|
||||
ret void
|
||||
}
|
test/CodeGen/PowerPC/fast-isel-fold.ll (new file, 95 lines)

@@ -0,0 +1,95 @@
|
||||
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
|
||||
|
||||
@a = global i8 1, align 1
|
||||
@b = global i16 2, align 2
|
||||
@c = global i32 4, align 4
|
||||
|
||||
define i32 @t3() nounwind uwtable ssp {
|
||||
; ELF64: t3
|
||||
%1 = load i8* @a, align 1
|
||||
%2 = zext i8 %1 to i32
|
||||
; ELF64: lbz
|
||||
; ELF64-NOT: rlwinm
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @t4() nounwind uwtable ssp {
|
||||
; ELF64: t4
|
||||
%1 = load i16* @b, align 2
|
||||
%2 = zext i16 %1 to i32
|
||||
; ELF64: lhz
|
||||
; ELF64-NOT: rlwinm
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @t5() nounwind uwtable ssp {
|
||||
; ELF64: t5
|
||||
%1 = load i16* @b, align 2
|
||||
%2 = sext i16 %1 to i32
|
||||
; ELF64: lha
|
||||
; ELF64-NOT: rlwinm
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @t6() nounwind uwtable ssp {
|
||||
; ELF64: t6
|
||||
%1 = load i8* @a, align 2
|
||||
%2 = sext i8 %1 to i32
|
||||
; ELF64: lbz
|
||||
; ELF64-NOT: rlwinm
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i64 @t7() nounwind uwtable ssp {
|
||||
; ELF64: t7
|
||||
%1 = load i8* @a, align 1
|
||||
%2 = zext i8 %1 to i64
|
||||
; ELF64: lbz
|
||||
; ELF64-NOT: rldicl
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define i64 @t8() nounwind uwtable ssp {
|
||||
; ELF64: t8
|
||||
%1 = load i16* @b, align 2
|
||||
%2 = zext i16 %1 to i64
|
||||
; ELF64: lhz
|
||||
; ELF64-NOT: rldicl
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define i64 @t9() nounwind uwtable ssp {
|
||||
; ELF64: t9
|
||||
%1 = load i16* @b, align 2
|
||||
%2 = sext i16 %1 to i64
|
||||
; ELF64: lha
|
||||
; ELF64-NOT: extsh
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define i64 @t10() nounwind uwtable ssp {
|
||||
; ELF64: t10
|
||||
%1 = load i8* @a, align 2
|
||||
%2 = sext i8 %1 to i64
|
||||
; ELF64: lbz
|
||||
; ELF64: extsb
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define i64 @t11() nounwind uwtable ssp {
|
||||
; ELF64: t11
|
||||
%1 = load i32* @c, align 4
|
||||
%2 = zext i32 %1 to i64
|
||||
; ELF64: lwz
|
||||
; ELF64-NOT: rldicl
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define i64 @t12() nounwind uwtable ssp {
|
||||
; ELF64: t12
|
||||
%1 = load i32* @c, align 4
|
||||
%2 = sext i32 %1 to i64
|
||||
; ELF64: lwa
|
||||
; ELF64-NOT: extsw
|
||||
ret i64 %2
|
||||
}
|
test/CodeGen/PowerPC/fast-isel-load-store.ll (new file, 202 lines)

@@ -0,0 +1,202 @@
|
||||
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
|
||||
|
||||
; This test verifies that load/store instructions are properly generated,
|
||||
; and that they pass MI verification.
|
||||
|
||||
@a = global i8 1, align 1
|
||||
@b = global i16 2, align 2
|
||||
@c = global i32 4, align 4
|
||||
@d = global i64 8, align 8
|
||||
@e = global float 1.25, align 4
|
||||
@f = global double 3.5, align 8
|
||||
|
||||
%struct.s = type<{ i8, i32 }>
|
||||
%struct.t = type<{ i8, i64 }>
|
||||
|
||||
@g = global %struct.s <{ i8 1, i32 2 }>, align 1
|
||||
@h = global %struct.t <{ i8 1, i64 2 }>, align 1
|
||||
|
||||
@i = common global [8192 x i64] zeroinitializer, align 8
|
||||
|
||||
; load
|
||||
|
||||
define i8 @t1() nounwind uwtable ssp {
|
||||
; ELF64: t1
|
||||
%1 = load i8* @a, align 1
|
||||
; ELF64: lbz
|
||||
%2 = add nsw i8 %1, 1
|
||||
; ELF64: addi
|
||||
ret i8 %2
|
||||
}
|
||||
|
||||
define i16 @t2() nounwind uwtable ssp {
|
||||
; ELF64: t2
|
||||
%1 = load i16* @b, align 2
|
||||
; ELF64: lhz
|
||||
%2 = add nsw i16 %1, 1
|
||||
; ELF64: addi
|
||||
ret i16 %2
|
||||
}
|
||||
|
||||
define i32 @t3() nounwind uwtable ssp {
|
||||
; ELF64: t3
|
||||
%1 = load i32* @c, align 4
|
||||
; ELF64: lwz
|
||||
%2 = add nsw i32 %1, 1
|
||||
; ELF64: addi
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i64 @t4() nounwind uwtable ssp {
|
||||
; ELF64: t4
|
||||
%1 = load i64* @d, align 4
|
||||
; ELF64: ld
|
||||
%2 = add nsw i64 %1, 1
|
||||
; ELF64: addi
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define float @t5() nounwind uwtable ssp {
|
||||
; ELF64: t5
|
||||
%1 = load float* @e, align 4
|
||||
; ELF64: lfs
|
||||
%2 = fadd float %1, 1.0
|
||||
; ELF64: fadds
|
||||
ret float %2
|
||||
}
|
||||
|
||||
define double @t6() nounwind uwtable ssp {
|
||||
; ELF64: t6
|
||||
%1 = load double* @f, align 8
|
||||
; ELF64: lfd
|
||||
%2 = fadd double %1, 1.0
|
||||
; ELF64: fadd
|
||||
ret double %2
|
||||
}
|
||||
|
||||
; store
|
||||
|
||||
define void @t7(i8 %v) nounwind uwtable ssp {
|
||||
; ELF64: t7
|
||||
%1 = add nsw i8 %v, 1
|
||||
store i8 %1, i8* @a, align 1
|
||||
; ELF64: addis
|
||||
; ELF64: addi
|
||||
; ELF64: addi
|
||||
; ELF64: stb
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @t8(i16 %v) nounwind uwtable ssp {
|
||||
; ELF64: t8
|
||||
%1 = add nsw i16 %v, 1
|
||||
store i16 %1, i16* @b, align 2
|
||||
; ELF64: addis
|
||||
; ELF64: addi
|
||||
; ELF64: addi
|
||||
; ELF64: sth
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @t9(i32 %v) nounwind uwtable ssp {
|
||||
; ELF64: t9
|
||||
%1 = add nsw i32 %v, 1
|
||||
store i32 %1, i32* @c, align 4
|
||||
; ELF64: addis
|
||||
; ELF64: addi
|
||||
; ELF64: addi
|
||||
; ELF64: stw
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @t10(i64 %v) nounwind uwtable ssp {
|
||||
; ELF64: t10
|
||||
%1 = add nsw i64 %v, 1
|
||||
store i64 %1, i64* @d, align 4
|
||||
; ELF64: addis
|
||||
; ELF64: addi
|
||||
; ELF64: addi
|
||||
; ELF64: std
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @t11(float %v) nounwind uwtable ssp {
|
||||
; ELF64: t11
|
||||
%1 = fadd float %v, 1.0
|
||||
store float %1, float* @e, align 4
|
||||
; ELF64: fadds
|
||||
; ELF64: stfs
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @t12(double %v) nounwind uwtable ssp {
|
||||
; ELF64: t12
|
||||
%1 = fadd double %v, 1.0
|
||||
store double %1, double* @f, align 8
|
||||
; ELF64: fadd
|
||||
; ELF64: stfd
|
||||
ret void
|
||||
}
|
||||
|
||||
;; lwa requires an offset divisible by 4, so we need lwax here.
|
||||
define i64 @t13() nounwind uwtable ssp {
|
||||
; ELF64: t13
|
||||
%1 = load i32* getelementptr inbounds (%struct.s* @g, i32 0, i32 1), align 1
|
||||
%2 = sext i32 %1 to i64
|
||||
; ELF64: li
|
||||
; ELF64: lwax
|
||||
%3 = add nsw i64 %2, 1
|
||||
; ELF64: addi
|
||||
ret i64 %3
|
||||
}
|
||||
|
||||
;; ld requires an offset divisible by 4, so we need ldx here.
|
||||
define i64 @t14() nounwind uwtable ssp {
|
||||
; ELF64: t14
|
||||
%1 = load i64* getelementptr inbounds (%struct.t* @h, i32 0, i32 1), align 1
|
||||
; ELF64: li
|
||||
; ELF64: ldx
|
||||
%2 = add nsw i64 %1, 1
|
||||
; ELF64: addi
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
;; std requires an offset divisible by 4, so we need stdx here.
|
||||
define void @t15(i64 %v) nounwind uwtable ssp {
|
||||
; ELF64: t15
|
||||
%1 = add nsw i64 %v, 1
|
||||
store i64 %1, i64* getelementptr inbounds (%struct.t* @h, i32 0, i32 1), align 1
|
||||
; ELF64: addis
|
||||
; ELF64: addi
|
||||
; ELF64: addi
|
||||
; ELF64: li
|
||||
; ELF64: stdx
|
||||
ret void
|
||||
}
|
||||
|
||||
;; ld requires an offset that fits in 16 bits, so we need ldx here.
|
||||
define i64 @t16() nounwind uwtable ssp {
|
||||
; ELF64: t16
|
||||
%1 = load i64* getelementptr inbounds ([8192 x i64]* @i, i32 0, i64 5000), align 8
|
||||
; ELF64: lis
|
||||
; ELF64: ori
|
||||
; ELF64: ldx
|
||||
%2 = add nsw i64 %1, 1
|
||||
; ELF64: addi
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
;; std requires an offset that fits in 16 bits, so we need stdx here.
|
||||
define void @t17(i64 %v) nounwind uwtable ssp {
|
||||
; ELF64: t17
|
||||
%1 = add nsw i64 %v, 1
|
||||
store i64 %1, i64* getelementptr inbounds ([8192 x i64]* @i, i32 0, i64 5000), align 8
|
||||
; ELF64: addis
|
||||
; ELF64: ld
|
||||
; ELF64: addi
|
||||
; ELF64: lis
|
||||
; ELF64: ori
|
||||
; ELF64: stdx
|
||||
ret void
|
||||
}
|
||||
|