[PPC64LE] Generate correct code for unaligned little-endian vector loads

The code in PPCTargetLowering::PerformDAGCombine() that handles
unaligned Altivec vector loads generates a lvsl followed by a vperm.
As we've seen in numerous other places, the vperm instruction has a
big-endian bias, and this is fixed for little endian by complementing
the permute control vector and swapping the input operands.  In this
case the lvsl provides the permute control vector.  Rather than
generating an lvsl followed by a complement operation, it is sufficient
to generate an lvsr instruction instead.  Thus for LE code generation
we generate an lvsr rather than an lvsl, and swap the other two input
operands of the vperm.
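
For reference, the big-endian form of this expansion is the classic
Altivec misaligned-load idiom from the Velocity Engine notes cited in
the code.  A rough C sketch using the standard <altivec.h> intrinsics
(the function and variable names are illustrative only, not part of
this patch; the intrinsics stand in for the lvsl, lvx, and vperm nodes
the DAG combine builds):

    #include <altivec.h>

    /* Load 16 bytes from a possibly misaligned address p (big-endian idiom). */
    vector unsigned char load_misaligned(const unsigned char *p) {
      vector unsigned char ctrl  = vec_lvsl(0, p);  /* control from p & 15       */
      vector unsigned char base  = vec_ld(0, p);    /* aligned load covering p   */
      vector unsigned char extra = vec_ld(15, p);   /* aligned load covering p+15;
                                                       using +15 rather than +16
                                                       avoids a page fault when p
                                                       is already aligned        */
      return vec_perm(base, extra, ctrl);
    }

    /* With this change, little-endian code generation instead uses lvsr to
       produce ctrl and swaps the two vector inputs of the vperm. */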

The existing test/CodeGen/PowerPC/vec_misalign.ll is updated to test
the code generation for PPC64 and PPC64LE, in addition to the existing
PPC32/G5 testing.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@210493 91177308-0d34-0410-b5e6-96231b3b80d8
commit 8e38e86266 (parent f4a702c079)
Author: Bill Schmidt
Date:   2014-06-09 22:00:52 +00:00

2 changed files with 48 additions and 22 deletions

lib/Target/PowerPC/PPCISelLowering.cpp

@@ -8095,6 +8095,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// This is a type-legal unaligned Altivec load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
bool isLittleEndian = PPCSubTarget.isLittleEndian();
// This implements the loading of unaligned vectors as described in
// the venerable Apple Velocity Engine overview. Specifically:
@@ -8102,25 +8103,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
//
// The general idea is to expand a sequence of one or more unaligned
// loads into a alignment-based permutation-control instruction (lvsl),
// a series of regular vector loads (which always truncate their
// input address to an aligned address), and a series of permutations.
// The results of these permutations are the requested loaded values.
// The trick is that the last "extra" load is not taken from the address
// you might suspect (sizeof(vector) bytes after the last requested
// load), but rather sizeof(vector) - 1 bytes after the last
// requested vector. The point of this is to avoid a page fault if the
// base address happened to be aligned. This works because if the base
// address is aligned, then adding less than a full vector length will
// cause the last vector in the sequence to be (re)loaded. Otherwise,
// the next vector will be fetched as you might suspect was necessary.
// loads into an alignment-based permutation-control instruction (lvsl
// or lvsr), a series of regular vector loads (which always truncate
// their input address to an aligned address), and a series of
// permutations. The results of these permutations are the requested
// loaded values. The trick is that the last "extra" load is not taken
// from the address you might suspect (sizeof(vector) bytes after the
// last requested load), but rather sizeof(vector) - 1 bytes after the
// last requested vector. The point of this is to avoid a page fault if
// the base address happened to be aligned. This works because if the
// base address is aligned, then adding less than a full vector length
// will cause the last vector in the sequence to be (re)loaded.
// Otherwise, the next vector will be fetched as you might suspect was
// necessary.
// We might be able to reuse the permutation generation from
// a different base address offset from this one by an aligned amount.
// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
// optimization later.
SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
DAG, dl, MVT::v16i8);
Intrinsic::ID Intr = (isLittleEndian ?
Intrinsic::ppc_altivec_lvsr :
Intrinsic::ppc_altivec_lvsl);
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
// Refine the alignment of the original load (a "new" load created here
// which was identical to the first except for the alignment would be
@@ -8169,8 +8173,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (ExtraLoad.getValueType() != MVT::v4i32)
ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
// Because vperm has a big-endian bias, we must reverse the order
// of the input vectors and complement the permute control vector
// when generating little endian code. We have already handled the
// latter by using lvsr instead of lvsl, so just reverse BaseLoad
// and ExtraLoad here.
SDValue Perm;
if (isLittleEndian)
Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
ExtraLoad, BaseLoad, PermCntl, DAG, dl);
else
Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
if (VT != MVT::v4i32)
Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
@@ -8210,9 +8224,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
case ISD::INTRINSIC_WO_CHAIN:
if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() ==
Intrinsic::ppc_altivec_lvsl &&
case ISD::INTRINSIC_WO_CHAIN: {
bool isLittleEndian = PPCSubTarget.isLittleEndian();
Intrinsic::ID Intr = (isLittleEndian ?
Intrinsic::ppc_altivec_lvsr :
Intrinsic::ppc_altivec_lvsl);
if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
N->getOperand(1)->getOpcode() == ISD::ADD) {
SDValue Add = N->getOperand(1);
@@ -8224,8 +8241,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
UE = BasePtr->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
Intrinsic::ppc_altivec_lvsl) {
// We've found another LVSL, and this address if an aligned
Intr) {
// We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
@@ -8234,6 +8251,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
}
break;
case ISD::BSWAP:

test/CodeGen/PowerPC/vec_misalign.ll

@@ -1,4 +1,6 @@
; RUN: llc < %s -march=ppc32 -mcpu=g5
; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
target triple = "powerpc-apple-darwin8"
@@ -8,6 +10,8 @@ target triple = "powerpc-apple-darwin8"
define void @foo(i32 %x, ...) {
entry:
; CHECK: foo:
; CHECK-LE: foo:
%x_addr = alloca i32 ; <i32*> [#uses=1]
%ap = alloca i8* ; <i8**> [#uses=3]
%ap.0 = alloca i8* ; <i8**> [#uses=3]
@@ -27,6 +31,10 @@ entry:
%tmp8 = getelementptr %struct.u16qi* %tmp6, i32 0, i32 0 ; <<16 x i8>*> [#uses=1]
%tmp9 = getelementptr %struct.u16qi* %tmp7, i32 0, i32 0 ; <<16 x i8>*> [#uses=1]
%tmp10 = load <16 x i8>* %tmp9, align 4 ; <<16 x i8>> [#uses=1]
; CHECK: lvsl
; CHECK: vperm
; CHECK-LE: lvsr
; CHECK-LE: vperm
store <16 x i8> %tmp10, <16 x i8>* %tmp8, align 4
br label %return