Now that SROA can form allocas for dynamic vector accesses, further improve it to replace operations on these vector allocas with insertelement/extractelement instructions.
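
For illustration, here is a minimal sketch of the kind of IR this change now
handles (modelled on the updated test2/test8 CHECK lines; the function and
value names below are made up for the example, not taken from the tests):

  define float @example(i32 %idx1, i32 %idx2) {
  entry:
    %vec = alloca <4 x float>
    store <4 x float> zeroinitializer, <4 x float>* %vec
    %ptr1 = getelementptr <4 x float>* %vec, i32 0, i32 %idx1
    store float 1.0, float* %ptr1
    %ptr2 = getelementptr <4 x float>* %vec, i32 0, i32 %idx2
    %ret = load float* %ptr2
    ret float %ret
  }

SROA can now rewrite the dynamically indexed store and load as insertelement
and extractelement on the promoted vector value, eliminating the alloca:

    %ins = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
    %ret = extractelement <4 x float> %ins, i32 %idx2
    ret float %ret

When the dynamic access is at a non-zero constant offset within the vector,
the scalar index is materialized with an add (the new "dyn.offset" value),
as checked by test8.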

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@158623 91177308-0d34-0410-b5e6-96231b3b80d8
Pete Cooper 2012-06-17 03:58:26 +00:00
parent e04690e092
commit 80f020a34a
2 changed files with 161 additions and 61 deletions


@@ -264,23 +264,31 @@ class ConvertToScalarInfo {
/// large integers unless there is some potential for optimization.
bool HadNonMemTransferAccess;
/// HadDynamicAccess - True if some element of this alloca was accessed
/// dynamically (with a non-constant index). We don't yet have support for
/// turning a dynamic access into a large integer.
bool HadDynamicAccess;
public:
explicit ConvertToScalarInfo(unsigned Size, const TargetData &td)
: AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown),
VectorTy(0), HadNonMemTransferAccess(false) { }
VectorTy(0), HadNonMemTransferAccess(false), HadDynamicAccess(false) { }
AllocaInst *TryConvert(AllocaInst *AI);
private:
bool CanConvertToScalar(Value *V, uint64_t Offset);
bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx);
void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
Value *NonConstantIdx);
Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
uint64_t Offset, IRBuilder<> &Builder);
uint64_t Offset, Value* NonConstantIdx,
IRBuilder<> &Builder);
Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
uint64_t Offset, IRBuilder<> &Builder);
uint64_t Offset, Value* NonConstantIdx,
IRBuilder<> &Builder);
};
} // end anonymous namespace.
@@ -291,7 +299,7 @@ private:
AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
// If we can't convert this scalar, or if mem2reg can trivially do it, bail
// out.
if (!CanConvertToScalar(AI, 0) || !IsNotTrivial)
if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial)
return 0;
// If an alloca has only memset / memcpy uses, it may still have an Unknown
@@ -319,13 +327,18 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
!HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth))
return 0;
// Dynamic accesses on integers aren't yet supported. They need us to shift
// by a dynamic amount which could be difficult to work out as we might not
// know whether to use a left or right shift.
if (ScalarKind == Integer && HadDynamicAccess)
return 0;
DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
// Create and insert the integer alloca.
NewTy = IntegerType::get(AI->getContext(), BitWidth);
}
AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
ConvertUsesToScalar(AI, NewAI, 0);
ConvertUsesToScalar(AI, NewAI, 0, 0);
return NewAI;
}
@@ -412,7 +425,8 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
///
/// If we see at least one access to the value as a vector type, set the
/// SawVec flag.
bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
Value* NonConstantIdx) {
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
Instruction *User = cast<Instruction>(*UI);
@@ -442,24 +456,35 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
if (!onlyUsedByLifetimeMarkers(BCI))
IsNotTrivial = true; // Can't be mem2reg'd.
if (!CanConvertToScalar(BCI, Offset))
if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
return false;
continue;
}
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
// If this is a GEP with variable indices, we can't handle it.
if (!GEP->hasAllConstantIndices())
PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
if (!PtrTy)
return false;
// Compute the offset that this GEP adds to the pointer.
SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
if (!GEP->getPointerOperandType()->isPointerTy())
return false;
uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
Value *GEPNonConstantIdx = 0;
if (!GEP->hasAllConstantIndices()) {
if (!isa<VectorType>(PtrTy->getElementType()))
return false;
if (NonConstantIdx)
return false;
GEPNonConstantIdx = Indices.pop_back_val();
if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
return false;
HadDynamicAccess = true;
} else
GEPNonConstantIdx = NonConstantIdx;
uint64_t GEPOffset = TD.getIndexedOffset(PtrTy,
Indices);
// See if all uses can be converted.
if (!CanConvertToScalar(GEP, Offset+GEPOffset))
if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
return false;
IsNotTrivial = true; // Can't be mem2reg'd.
HadNonMemTransferAccess = true;
@@ -469,6 +494,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
// If this is a constant sized memset of a constant value (e.g. 0) we can
// handle it.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
// Store to dynamic index.
if (NonConstantIdx)
return false;
// Store of constant value.
if (!isa<ConstantInt>(MSI->getValue()))
return false;
@@ -493,6 +521,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
// If this is a memcpy or memmove into or out of the whole allocation, we
// can handle it like a load or store of the scalar type.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
// Store to dynamic index.
if (NonConstantIdx)
return false;
ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0)
return false;
@@ -524,12 +555,13 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right. By the end of this, there should be no uses of Ptr.
void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
uint64_t Offset) {
uint64_t Offset,
Value* NonConstantIdx) {
while (!Ptr->use_empty()) {
Instruction *User = cast<Instruction>(Ptr->use_back());
if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
ConvertUsesToScalar(CI, NewAI, Offset);
ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
CI->eraseFromParent();
continue;
}
@@ -537,9 +569,11 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
// Compute the offset that this GEP adds to the pointer.
SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
if (!GEP->hasAllConstantIndices())
NonConstantIdx = Indices.pop_back_val();
uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
Indices);
ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, NonConstantIdx);
GEP->eraseFromParent();
continue;
}
@@ -550,7 +584,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// The load is a bit extract from NewAI shifted right by Offset bits.
Value *LoadedVal = Builder.CreateLoad(NewAI);
Value *NewLoadVal
= ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
= ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
NonConstantIdx, Builder);
LI->replaceAllUsesWith(NewLoadVal);
LI->eraseFromParent();
continue;
@@ -560,7 +595,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
assert(SI->getOperand(0) != Ptr && "Consistency error!");
Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
Builder);
NonConstantIdx, Builder);
Builder.CreateStore(New, NewAI);
SI->eraseFromParent();
@@ -575,6 +610,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// transform it into a store of the expanded constant value.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
assert(MSI->getRawDest() == Ptr && "Consistency error!");
assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
unsigned NumBytes = static_cast<unsigned>(SNumBytes);
@@ -591,7 +627,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
Value *New = ConvertScalar_InsertValue(
ConstantInt::get(User->getContext(), APVal),
Old, Offset, Builder);
Old, Offset, 0, Builder);
Builder.CreateStore(New, NewAI);
// If the load we just inserted is now dead, then the memset overwrote
@@ -607,6 +643,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// can handle it like a load or store of the scalar type.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
assert(Offset == 0 && "must be store to start of alloca");
assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");
// If the source and destination are both to the same alloca, then this is
// a noop copy-to-self, just delete it. Otherwise, emit a load and store
@@ -679,7 +716,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
/// shifted to the right.
Value *ConvertToScalarInfo::
ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
uint64_t Offset, IRBuilder<> &Builder) {
uint64_t Offset, Value* NonConstantIdx,
IRBuilder<> &Builder) {
// If the load is of the whole new alloca, no conversion is needed.
Type *FromType = FromVal->getType();
if (FromType == ToType && Offset == 0)
@@ -701,7 +739,17 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
}
// Return the element extracted out of it.
Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt));
Value *Idx;
if (NonConstantIdx) {
if (Elt)
Idx = Builder.CreateAdd(NonConstantIdx,
Builder.getInt32(Elt),
"dyn.offset");
else
Idx = NonConstantIdx;
} else
Idx = Builder.getInt32(Elt);
Value *V = Builder.CreateExtractElement(FromVal, Idx);
if (V->getType() != ToType)
V = Builder.CreateBitCast(V, ToType);
return V;
@@ -710,23 +758,27 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
// If ToType is a first class aggregate, extract out each of the pieces and
// use insertvalue's to form the FCA.
if (StructType *ST = dyn_cast<StructType>(ToType)) {
assert(!NonConstantIdx &&
"Dynamic indexing into struct types not supported");
const StructLayout &Layout = *TD.getStructLayout(ST);
Value *Res = UndefValue::get(ST);
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
Offset+Layout.getElementOffsetInBits(i),
Builder);
0, Builder);
Res = Builder.CreateInsertValue(Res, Elt, i);
}
return Res;
}
if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
assert(!NonConstantIdx &&
"Dynamic indexing into array types not supported");
uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
Value *Res = UndefValue::get(AT);
for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
Offset+i*EltSize, Builder);
Offset+i*EltSize, 0, Builder);
Res = Builder.CreateInsertValue(Res, Elt, i);
}
return Res;
@@ -792,9 +844,14 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.
///
/// NonConstantIdx is the index value of a GEP with a non-constant index, if
/// one was seen. If this is 0 then all GEPs used to find this insert address
/// had constant indices.
Value *ConvertToScalarInfo::
ConvertScalar_InsertValue(Value *SV, Value *Old,
uint64_t Offset, IRBuilder<> &Builder) {
uint64_t Offset, Value* NonConstantIdx,
IRBuilder<> &Builder) {
// Convert the stored type to the actual type, shift it left to insert
// then 'or' into place.
Type *AllocaType = Old->getType();
@@ -815,26 +872,40 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
SV = Builder.CreateBitCast(SV, EltTy);
uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy);
unsigned Elt = Offset/EltSize;
return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt));
Value *Idx;
if (NonConstantIdx) {
if (Elt)
Idx = Builder.CreateAdd(NonConstantIdx,
Builder.getInt32(Elt),
"dyn.offset");
else
Idx = NonConstantIdx;
} else
Idx = Builder.getInt32(Elt);
return Builder.CreateInsertElement(Old, SV, Idx);
}
// If SV is a first-class aggregate value, insert each value recursively.
if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
assert(!NonConstantIdx &&
"Dynamic indexing into struct types not supported");
const StructLayout &Layout = *TD.getStructLayout(ST);
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
Value *Elt = Builder.CreateExtractValue(SV, i);
Old = ConvertScalar_InsertValue(Elt, Old,
Offset+Layout.getElementOffsetInBits(i),
Builder);
0, Builder);
}
return Old;
}
if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
assert(!NonConstantIdx &&
"Dynamic indexing into array types not supported");
uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
Value *Elt = Builder.CreateExtractValue(SV, i);
Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder);
}
return Old;
}


@@ -4,12 +4,14 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
target triple = "x86_64-apple-darwin10.0.0"
; CHECK: @test1
; CHECK: %[[alloc0:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: %[[alloc1:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc0]]
; CHECK: %[[alloc:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc]]
; CHECK: memset
; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
; Split the array, but don't replace the memset with an insertelement
; as the offset is not constant.
; The load, however, can be replaced with an extractelement.
define float @test1(i32 %idx1, i32 %idx2) {
entry:
%0 = alloca [4 x <4 x float>]
@@ -23,13 +25,8 @@ entry:
}
; CHECK: @test2
; CHECK: %[[alloc:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc]]
; CHECK: %ptr1 = getelementptr inbounds <4 x float>* %[[alloc]], i32 0, i32 %idx1
; CHECK: store float 1.000000e+00, float* %ptr1
; CHECK: %ptr2 = getelementptr inbounds <4 x float>* %[[alloc]], i32 0, i32 %idx2
; CHECK: %ret = load float* %ptr2
; CHECK: ret float %ret
; CHECK: %[[ins:[\.a-z0-9]*]] = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
; CHECK: extractelement <4 x float> %[[ins]], i32 %idx2
; Do SROA on the array when it has dynamic vector reads and writes.
define float @test2(i32 %idx1, i32 %idx2) {
@@ -61,13 +58,34 @@ entry:
ret float %ret
}
; CHECK: @test4
; CHECK: test4
; CHECK: insertelement <16 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
; CHECK: extractelement <16 x float> %0, i32 %idx2
; Don't do SROA on a dynamically indexed vector when it spans
; more than one array element of the alloca array it is within.
; However, unlike test3, the store is on the vector type,
; so SROA will convert the large alloca into the large vector
; type and do all accesses with insertelement/extractelement.
define float @test4(i32 %idx1, i32 %idx2) {
entry:
%0 = alloca [4 x <4 x float>]
%bigvec = bitcast [4 x <4 x float>]* %0 to <16 x float>*
store <16 x float> zeroinitializer, <16 x float>* %bigvec
%ptr1 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx1
store float 1.0, float* %ptr1
%ptr2 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx2
%ret = load float* %ptr2
ret float %ret
}
; CHECK: @test5
; CHECK: %0 = alloca [4 x <4 x float>]
; CHECK-NOT: alloca
; Don't do SROA as there is a second dynamically indexed array
; which may span multiple elements of the alloca.
define float @test4(i32 %idx1, i32 %idx2) {
define float @test5(i32 %idx1, i32 %idx2) {
entry:
%0 = alloca [4 x <4 x float>]
store [4 x <4 x float>] zeroinitializer, [4 x <4 x float>]* %0
@@ -80,15 +98,9 @@ entry:
ret float %ret
}
; CHECK: test5
; CHECK: %[[alloc0:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: %[[alloc1:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc0]]
; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc1]]
; CHECK: %ptr1 = getelementptr inbounds <4 x float>* %[[alloc0]], i32 0, i32 %idx1
; CHECK: store float 1.000000e+00, float* %ptr1
; CHECK: %ptr2 = getelementptr inbounds <4 x float>* %[[alloc1]], i32 0, i32 %idx2
; CHECK: %ret = load float* %ptr2
; CHECK: test6
; CHECK: insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
%vector.pair = type { %vector.anon, %vector.anon }
%vector.anon = type { %vector }
@@ -99,7 +111,7 @@ entry:
; the original GEP, just the indices it needs to get to the correct offset of
; some type, not necessarily the dynamic vector.
; This test makes sure we don't have this crash.
define float @test5(i32 %idx1, i32 %idx2) {
define float @test6(i32 %idx1, i32 %idx2) {
entry:
%0 = alloca %vector.pair
store %vector.pair zeroinitializer, %vector.pair* %0
@@ -110,21 +122,15 @@ entry:
ret float %ret
}
; CHECK: test6
; CHECK: %[[alloc0:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: %[[alloc1:[\.a-z0-9]*]] = alloca <4 x float>
; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc0]]
; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc1]]
; CHECK: %ptr1 = getelementptr inbounds <4 x float>* %[[alloc0]], i32 0, i32 %idx1
; CHECK: store float 1.000000e+00, float* %ptr1
; CHECK: %ptr2 = getelementptr inbounds <4 x float>* %[[alloc1]], i32 0, i32 %idx2
; CHECK: %ret = load float* %ptr2
; CHECK: test7
; CHECK: insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
%array.pair = type { [2 x %array.anon], %array.anon }
%array.anon = type { [2 x %vector] }
; This is the same as test5 and tests the same crash, but on arrays.
define float @test6(i32 %idx1, i32 %idx2) {
; This is the same as test6 and tests the same crash, but on arrays.
define float @test7(i32 %idx1, i32 %idx2) {
entry:
%0 = alloca %array.pair
store %array.pair zeroinitializer, %array.pair* %0
@@ -135,4 +141,27 @@ entry:
ret float %ret
}
; CHECK: test8
; CHECK: %[[offset1:[\.a-z0-9]*]] = add i32 %idx1, 1
; CHECK: %[[ins:[\.a-z0-9]*]] = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %[[offset1]]
; CHECK: %[[offset2:[\.a-z0-9]*]] = add i32 %idx2, 2
; CHECK: extractelement <4 x float> %[[ins]], i32 %[[offset2]]
; Do SROA on the vector when it has dynamic vector reads and writes
; from a non-zero offset.
define float @test8(i32 %idx1, i32 %idx2) {
entry:
%0 = alloca <4 x float>
store <4 x float> zeroinitializer, <4 x float>* %0
%ptr1 = getelementptr <4 x float>* %0, i32 0, i32 1
%ptr2 = bitcast float* %ptr1 to <3 x float>*
%ptr3 = getelementptr <3 x float>* %ptr2, i32 0, i32 %idx1
store float 1.0, float* %ptr3
%ptr4 = getelementptr <4 x float>* %0, i32 0, i32 2
%ptr5 = bitcast float* %ptr4 to <2 x float>*
%ptr6 = getelementptr <2 x float>* %ptr5, i32 0, i32 %idx2
%ret = load float* %ptr6
ret float %ret
}
declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)