diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 91366e2cceb..b7d7aaa5a88 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -8095,6 +8095,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // This is a type-legal unaligned Altivec load.
       SDValue Chain = LD->getChain();
       SDValue Ptr = LD->getBasePtr();
+      bool isLittleEndian = PPCSubTarget.isLittleEndian();
 
       // This implements the loading of unaligned vectors as described in
       // the venerable Apple Velocity Engine overview. Specifically:
@@ -8102,25 +8103,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
       //
       // The general idea is to expand a sequence of one or more unaligned
-      // loads into a alignment-based permutation-control instruction (lvsl),
-      // a series of regular vector loads (which always truncate their
-      // input address to an aligned address), and a series of permutations.
-      // The results of these permutations are the requested loaded values.
-      // The trick is that the last "extra" load is not taken from the address
-      // you might suspect (sizeof(vector) bytes after the last requested
-      // load), but rather sizeof(vector) - 1 bytes after the last
-      // requested vector. The point of this is to avoid a page fault if the
-      // base address happened to be aligned. This works because if the base
-      // address is aligned, then adding less than a full vector length will
-      // cause the last vector in the sequence to be (re)loaded. Otherwise,
-      // the next vector will be fetched as you might suspect was necessary.
+      // loads into an alignment-based permutation-control instruction (lvsl
+      // or lvsr), a series of regular vector loads (which always truncate
+      // their input address to an aligned address), and a series of
+      // permutations. The results of these permutations are the requested
+      // loaded values. The trick is that the last "extra" load is not taken
+      // from the address you might suspect (sizeof(vector) bytes after the
+      // last requested load), but rather sizeof(vector) - 1 bytes after the
+      // last requested vector. The point of this is to avoid a page fault if
+      // the base address happened to be aligned. This works because if the
+      // base address is aligned, then adding less than a full vector length
+      // will cause the last vector in the sequence to be (re)loaded.
+      // Otherwise, the next vector will be fetched as you might suspect was
+      // necessary.
 
       // We might be able to reuse the permutation generation from
       // a different base address offset from this one by an aligned amount.
       // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
       // optimization later.
-      SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
-                                          DAG, dl, MVT::v16i8);
+      Intrinsic::ID Intr = (isLittleEndian ?
+                            Intrinsic::ppc_altivec_lvsr :
+                            Intrinsic::ppc_altivec_lvsl);
+      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
 
       // Refine the alignment of the original load (a "new" load created here
       // which was identical to the first except for the alignment would be
@@ -8169,8 +8173,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       if (ExtraLoad.getValueType() != MVT::v4i32)
         ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
 
-      SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
-                                      BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+      // Because vperm has a big-endian bias, we must reverse the order
+      // of the input vectors and complement the permute control vector
+      // when generating little endian code.  We have already handled the
+      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
+      // and ExtraLoad here.
+      SDValue Perm;
+      if (isLittleEndian)
+        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
+      else
+        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);
 
       if (VT != MVT::v4i32)
         Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
@@ -8210,9 +8224,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
     break;
-  case ISD::INTRINSIC_WO_CHAIN:
-    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() ==
-          Intrinsic::ppc_altivec_lvsl &&
+  case ISD::INTRINSIC_WO_CHAIN: {
+    bool isLittleEndian = PPCSubTarget.isLittleEndian();
+    Intrinsic::ID Intr = (isLittleEndian ?
+                          Intrinsic::ppc_altivec_lvsr :
+                          Intrinsic::ppc_altivec_lvsl);
+    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
         N->getOperand(1)->getOpcode() == ISD::ADD) {
       SDValue Add = N->getOperand(1);
 
@@ -8224,8 +8241,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
            UE = BasePtr->use_end(); UI != UE; ++UI) {
         if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
             cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
-              Intrinsic::ppc_altivec_lvsl) {
-          // We've found another LVSL, and this address if an aligned
+              Intr) {
+          // We've found another LVSL/LVSR, and this address is an aligned
           // multiple of that one. The results will be the same, so use the
           // one we've just found instead.
 
@@ -8234,6 +8251,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         }
       }
     }
+    }
     break;
   case ISD::BSWAP:
diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll
index d7ed64a5b1c..304a84d49a9 100644
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5
+; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin8"
 
@@ -8,6 +10,8 @@ target triple = "powerpc-apple-darwin8"
 
 define void @foo(i32 %x, ...) {
 entry:
+; CHECK: foo:
+; CHECK-LE: foo:
 	%x_addr = alloca i32		; <i32*> [#uses=1]
 	%ap = alloca i8*		; <i8**> [#uses=3]
 	%ap.0 = alloca i8*		; <i8**> [#uses=3]
@@ -27,6 +31,10 @@ entry:
 	%tmp8 = getelementptr %struct.u16qi* %tmp6, i32 0, i32 0		; <<16 x i8>*> [#uses=1]
 	%tmp9 = getelementptr %struct.u16qi* %tmp7, i32 0, i32 0		; <<16 x i8>*> [#uses=1]
 	%tmp10 = load <16 x i8>* %tmp9, align 4		; <<16 x i8>> [#uses=1]
+; CHECK: lvsl
+; CHECK: vperm
+; CHECK-LE: lvsr
+; CHECK-LE: vperm
 	store <16 x i8> %tmp10, <16 x i8>* %tmp8, align 4
 	br label %return
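Note (not part of the patch): the comment rewritten in PerformDAGCombine refers to the classic Apple Velocity Engine idiom for unaligned vector loads. For readers who have not seen it, below is a minimal C sketch of the big-endian sequence that the DAG combine reproduces at the SelectionDAG level. The function name and the use of altivec.h intrinsics are illustrative only and do not appear anywhere in the patch; the intrinsics shown (vec_lvsl, vec_ld, vec_perm) are the standard AltiVec ones.

  #include <altivec.h>

  /* Illustrative big-endian sketch of the unaligned-load idiom the patch's
     comment describes: returns the 16 bytes starting at a possibly
     unaligned address p. */
  static vector unsigned char load_unaligned_be(const unsigned char *p) {
    /* Permute control derived from the low four bits of the address. */
    vector unsigned char cntl = vec_lvsl(0, p);
    /* vec_ld (lvx) truncates its effective address, so this fetches the
       aligned 16-byte block containing the first requested byte. */
    vector unsigned char lo = vec_ld(0, p);
    /* The "extra" load is taken sizeof(vector) - 1 bytes later, not 16,
       so an already-aligned p simply re-loads the same block instead of
       touching (and possibly faulting on) the next page. */
    vector unsigned char hi = vec_ld(15, p);
    /* Select the 16 requested bytes out of the two aligned blocks. */
    return vec_perm(lo, hi, cntl);
  }

As a concrete check of the little-endian change, suppose the load address is misaligned by 4 bytes. On big endian, lvx puts the lowest-addressed byte of each aligned block in the register byte that vperm numbers 0, so vperm(BaseLoad, ExtraLoad, cntl) needs the control {4, 5, ..., 19}, which is exactly what lvsl generates. On little endian, lvx places the highest-addressed byte in vperm byte 0 instead, so the concatenation ExtraLoad:BaseLoad holds the 32 loaded bytes in descending address order, and the control {12, 13, ..., 27} generated by lvsr selects the requested bytes. That is why the patch both switches the permute control to lvsr and swaps the two vperm operands.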