From 0328ca6cd7f194c4ee7e4a8203f56ae7a17ed014 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Wed, 18 Mar 2015 12:01:59 +0000 Subject: [PATCH] [ARM] Align stack objects passed to memory intrinsics Memcpy, and other memory intrinsics, typically tries to use LDM/STM if the source and target addresses are 4-byte aligned. In CodeGenPrepare look for calls to memory intrinsics and, if the object is on the stack, 4-byte align it if it's large enough that we expect that memcpy would want to use LDM/STM to copy it. Differential Revision: http://reviews.llvm.org/D7908 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232627 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 9 + lib/CodeGen/CodeGenPrepare.cpp | 36 ++++ lib/Target/ARM/ARMISelLowering.cpp | 15 ++ lib/Target/ARM/ARMISelLowering.h | 3 + test/CodeGen/ARM/memfunc.ll | 293 +++++++++++++++++++++++++-- test/CodeGen/ARM/memset-inline.ll | 2 +- 6 files changed, 337 insertions(+), 21 deletions(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 5cf853cca2a..390cc9250b0 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -976,6 +976,15 @@ public: return false; } + /// Return true if the pointer arguments to CI should be aligned by aligning + /// the object whose address is being passed. If so then MinSize is set to the + /// minimum size the object must be to be aligned and PrefAlign is set to the + /// preferred alignment. + virtual bool shouldAlignPointerArgs(CallInst */*CI*/, unsigned &/*MinSize*/, + unsigned &/*PrefAlign*/) const { + return false; + } + //===--------------------------------------------------------------------===// /// \name Helpers for TargetTransformInfo implementations /// @{ diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 61485fc67df..314c73437e0 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1228,6 +1228,42 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return true; } + const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr; + + // Align the pointer arguments to this call if the target thinks it's a good + // idea + unsigned MinSize, PrefAlign; + if (TLI && TD && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { + for (auto &Arg : CI->arg_operands()) { + // We want to align both objects whose address is used directly and + // objects whose address is used in casts and GEPs, though it only makes + // sense for GEPs if the offset is a multiple of the desired alignment and + // if size - offset meets the size threshold. 
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ APInt Offset(TD->getPointerSizeInBits(
+ cast<PointerType>(Arg->getType())->getAddressSpace()), 0);
+ Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*TD, Offset);
+ uint64_t Offset2 = Offset.getLimitedValue();
+ AllocaInst *AI;
+ if ((Offset2 & (PrefAlign-1)) == 0 &&
+ (AI = dyn_cast<AllocaInst>(Val)) &&
+ AI->getAlignment() < PrefAlign &&
+ TD->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
+ AI->setAlignment(PrefAlign);
+ // TODO: Also align GlobalVariables
+ }
+ // If this is a memcpy (or similar) then we may be able to improve the
+ // alignment
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+ unsigned Align = getKnownAlignment(MI->getDest(), *TD);
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
+ Align = std::min(Align, getKnownAlignment(MTI->getSource(), *TD));
+ if (Align > MI->getAlignment())
+ MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
+ }
+ }
+
 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
 if (II) {
 switch (II->getIntrinsicID()) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 74e8512851e..fb12cc226af 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Support/CommandLine.h"
@@ -1163,6 +1164,20 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
 return TargetLowering::getRegClassFor(VT);
 }

+// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects passed to memory intrinsics.
+bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+ unsigned &PrefAlign) const {
+ if (!isa<MemIntrinsic>(CI))
+ return false;
+ MinSize = 8;
+ // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
+ // cycle faster than 4-byte aligned LDM.
+ PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
+ return true;
+}
+
 // Create a fast isel object.
 FastISel *
 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index a364933b38a..682f479278e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -368,6 +368,9 @@ namespace llvm {
 return true;
 }

+ bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+ unsigned &PrefAlign) const override;
+
 /// createFastISel - This method returns a target specific FastISel object,
 /// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo, diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll index 8d3800b43c1..160096a3808 100644 --- a/test/CodeGen/ARM/memfunc.ll +++ b/test/CodeGen/ARM/memfunc.ll @@ -1,31 +1,284 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios -o - | FileCheck %s -; RUN: llc < %s -mtriple=thumbv7m-none-macho -o - | FileCheck %s --check-prefix=DARWIN -; RUN: llc < %s -mtriple=arm-none-eabi -o - | FileCheck --check-prefix=EABI %s -; RUN: llc < %s -mtriple=arm-none-eabihf -o - | FileCheck --check-prefix=EABI %s +; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS --check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK @from = common global [500 x i32] zeroinitializer, align 4 @to = common global [500 x i32] zeroinitializer, align 4 -define void @f() { +define void @f1() { entry: + ; CHECK-LABEL: f1 - ; CHECK: memmove - ; EABI: __aeabi_memmove - call void @llvm.memmove.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + call void @llvm.memmove.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) - ; CHECK: memcpy - ; EABI: __aeabi_memcpy - call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0, i1 false) - ; EABI memset swaps arguments - ; CHECK: mov r1, #0 - ; CHECK: memset - ; DARWIN: movs r1, #0 - ; DARWIN: memset - ; EABI: mov r2, #0 - ; EABI: __aeabi_memset - call void @llvm.memset.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8 0, i32 500, i32 0, i1 false) - unreachable + ; EABI memset swaps arguments + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + call void @llvm.memset.p0i8.i32(i8* bitcast ([500 x i32]* @from to i8*), i8 0, i32 500, i32 0, i1 false) + unreachable +} + +; Check that alloca arguments to memory intrinsics are automatically aligned if at least 8 bytes in size +define void @f2(i8* %dest, i32 %n) { +entry: + ; CHECK-LABEL: f2 + + ; IOS (ARMv7) should 8-byte align, others should 4-byte align + ; CHECK-IOS: add r1, sp, #32 + ; CHECK-IOS: memmove + ; CHECK-DARWIN: add r1, sp, #28 + ; CHECK-DARWIN: memmove + ; CHECK-EABI: add r1, sp, #28 + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [9 x i8], align 1 + %0 = bitcast [9 x i8]* %arr0 to i8* + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: add r1, sp, #16 + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [9 x i8], align 1 + %1 = bitcast [9 x i8]* %arr1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK-IOS: mov r0, sp + ; 
CHECK-IOS: mov r1, #0
+ ; CHECK-IOS: memset
+ ; CHECK-DARWIN: add r0, sp, #4
+ ; CHECK-DARWIN: movs r1, #0
+ ; CHECK-DARWIN: memset
+ ; CHECK-EABI: add r0, sp, #4
+ ; CHECK-EABI: mov r2, #0
+ ; CHECK-EABI: __aeabi_memset
+ %arr2 = alloca [9 x i8], align 1
+ %2 = bitcast [9 x i8]* %arr2 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+
+ unreachable
+}
+
+; Check that alloca arguments are not aligned if less than 8 bytes in size
+define void @f3(i8* %dest, i32 %n) {
+entry:
+ ; CHECK-LABEL: f3
+
+ ; CHECK: {{add(.w)? r1, sp, #17|sub(.w)? r1, r7, #15}}
+ ; CHECK-IOS: memmove
+ ; CHECK-DARWIN: memmove
+ ; CHECK-EABI: __aeabi_memmove
+ %arr0 = alloca [7 x i8], align 1
+ %0 = bitcast [7 x i8]* %arr0 to i8*
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r1, sp, #10}}
+ ; CHECK-IOS: memcpy
+ ; CHECK-DARWIN: memcpy
+ ; CHECK-EABI: __aeabi_memcpy
+ %arr1 = alloca [7 x i8], align 1
+ %1 = bitcast [7 x i8]* %arr1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r0, sp, #3}}
+ ; CHECK-IOS: mov r1, #0
+ ; CHECK-IOS: memset
+ ; CHECK-DARWIN: movs r1, #0
+ ; CHECK-DARWIN: memset
+ ; CHECK-EABI: mov r2, #0
+ ; CHECK-EABI: __aeabi_memset
+ %arr2 = alloca [7 x i8], align 1
+ %2 = bitcast [7 x i8]* %arr2 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+
+ unreachable
+}
+
+; Check that alloca arguments are not aligned if size - offset is less than 8 bytes
+define void @f4(i8* %dest, i32 %n) {
+entry:
+ ; CHECK-LABEL: f4
+
+ ; CHECK: {{add(.w)? r., sp, #23|sub(.w)? r., r7, #17}}
+ ; CHECK-IOS: memmove
+ ; CHECK-DARWIN: memmove
+ ; CHECK-EABI: __aeabi_memmove
+ %arr0 = alloca [9 x i8], align 1
+ %0 = getelementptr inbounds [9 x i8], [9 x i8]* %arr0, i32 0, i32 4
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r., sp, #(10|14)}}
+ ; CHECK-IOS: memcpy
+ ; CHECK-DARWIN: memcpy
+ ; CHECK-EABI: __aeabi_memcpy
+ %arr1 = alloca [9 x i8], align 1
+ %1 = getelementptr inbounds [9 x i8], [9 x i8]* %arr1, i32 0, i32 4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r., sp, #(1|5)}}
+ ; CHECK-IOS: mov r1, #0
+ ; CHECK-IOS: memset
+ ; CHECK-DARWIN: movs r1, #0
+ ; CHECK-DARWIN: memset
+ ; CHECK-EABI: mov r2, #0
+ ; CHECK-EABI: __aeabi_memset
+ %arr2 = alloca [9 x i8], align 1
+ %2 = getelementptr inbounds [9 x i8], [9 x i8]* %arr2, i32 0, i32 4
+ call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+
+ unreachable
+}
+
+; Check that alloca arguments are not aligned if the offset is not a multiple of 4
+define void @f5(i8* %dest, i32 %n) {
+entry:
+ ; CHECK-LABEL: f5
+
+ ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #21}}
+ ; CHECK-IOS: memmove
+ ; CHECK-DARWIN: memmove
+ ; CHECK-EABI: __aeabi_memmove
+ %arr0 = alloca [13 x i8], align 1
+ %0 = getelementptr inbounds [13 x i8], [13 x i8]* %arr0, i32 0, i32 1
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? r., sp, #(10|14)}}
+ ; CHECK-IOS: memcpy
+ ; CHECK-DARWIN: memcpy
+ ; CHECK-EABI: __aeabi_memcpy
+ %arr1 = alloca [13 x i8], align 1
+ %1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 1
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
+
+ ; CHECK: {{add(.w)? 
r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 1 + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable +} + +; Check that alloca arguments are not aligned if the offset is unknown +define void @f6(i8* %dest, i32 %n, i32 %i) { +entry: + ; CHECK-LABEL: f6 + + ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #25}} + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [13 x i8], align 1 + %0 = getelementptr inbounds [13 x i8], [13 x i8]* %arr0, i32 0, i32 %i + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(10|14)}} + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [13 x i8], align 1 + %1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 %i + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 %i + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable +} + +; Check that alloca arguments are not aligned if the GEP is not inbounds +define void @f7(i8* %dest, i32 %n) { +entry: + ; CHECK-LABEL: f7 + + ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #21}} + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [13 x i8], align 1 + %0 = getelementptr [13 x i8], [13 x i8]* %arr0, i32 0, i32 4 + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(10|14)}} + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [13 x i8], align 1 + %1 = getelementptr [13 x i8], [13 x i8]* %arr1, i32 0, i32 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr [13 x i8], [13 x i8]* %arr2, i32 0, i32 4 + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable +} + +; Check that alloca arguments are not aligned when the offset is past the end of the allocation +define void @f8(i8* %dest, i32 %n) { +entry: + ; CHECK-LABEL: f8 + + ; CHECK: {{add(.w)? r., sp, #27|sub(.w)? r., r7, #21}} + ; CHECK-IOS: memmove + ; CHECK-DARWIN: memmove + ; CHECK-EABI: __aeabi_memmove + %arr0 = alloca [13 x i8], align 1 + %0 = getelementptr inbounds [13 x i8], [13 x i8]* %arr0, i32 0, i32 16 + call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? 
r., sp, #(10|14)}} + ; CHECK-IOS: memcpy + ; CHECK-DARWIN: memcpy + ; CHECK-EABI: __aeabi_memcpy + %arr1 = alloca [13 x i8], align 1 + %1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 16 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false) + + ; CHECK: {{add(.w)? r., sp, #(1|5)}} + ; CHECK-IOS: mov r1, #0 + ; CHECK-IOS: memset + ; CHECK-DARWIN: movs r1, #0 + ; CHECK-DARWIN: memset + ; CHECK-EABI: mov r2, #0 + ; CHECK-EABI: __aeabi_memset + %arr2 = alloca [13 x i8], align 1 + %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 16 + call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false) + + unreachable } declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index fa4f850b95a..191db1e20a2 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -17,7 +17,7 @@ entry: ; CHECK: add.w r1, r0, #10 ; CHECK: vmov.i32 {{q[0-9]+}}, #0x0 ; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
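
For reference, a minimal sketch of the IR rewrite the CodeGenPrepare change performs, assuming an ARMv7 A/R-class target (so shouldAlignPointerArgs returns MinSize = 8 and PrefAlign = 8); the function and value names below are illustrative, not part of the patch:

  ; Before CodeGenPrepare: an under-aligned stack buffer passed to memcpy
  declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1)

  define void @copy_out(i8* %dest, i32 %n) {
  entry:
    %buf = alloca [16 x i8], align 1
    %p = bitcast [16 x i8]* %buf to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %p, i32 %n, i32 0, i1 false)
    ret void
  }

  ; After OptimizeCallInst: the alloca meets the size threshold (16 >= MinSize + offset 0)
  ; and its alignment is below PrefAlign, so it is realigned in place, letting the ARM
  ; backend favour LDM/STM when the copy is expanded inline:
  ;   %buf = alloca [16 x i8], align 8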