[PPC64LE] Generate correct code for unaligned little-endian vector loads

The code in PPCTargetLowering::PerformDAGCombine() that handles
unaligned Altivec vector loads generates a lvsl followed by a vperm.
As we've seen in numerous other places, the vperm instruction has a
big-endian bias, and this is fixed for little endian by complementing
the permute control vector and swapping the input operands.  In this
case the lvsl provides the permute control vector.  Rather than
generating an lvsl followed by a complement operation, it is sufficient
to generate an lvsr instruction instead.  Thus for LE code generation
we generate an lvsr rather than an lvsl, and swap the other two input
operands of the vperm.
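
For reference, the big-endian form of this expansion is the classic
Altivec misaligned-load idiom from the Velocity Engine notes cited in
the code.  A rough C sketch using the standard <altivec.h> intrinsics
(the function and variable names are illustrative only, not part of
this patch; the intrinsics stand in for the lvsl, lvx, and vperm nodes
the DAG combine builds):

    #include <altivec.h>

    /* Load 16 bytes from a possibly misaligned address p (big-endian idiom). */
    vector unsigned char load_misaligned(const unsigned char *p) {
      vector unsigned char ctrl  = vec_lvsl(0, p);  /* control from p & 15       */
      vector unsigned char base  = vec_ld(0, p);    /* aligned load covering p   */
      vector unsigned char extra = vec_ld(15, p);   /* aligned load covering p+15;
                                                       using +15 rather than +16
                                                       avoids a page fault when p
                                                       is already aligned        */
      return vec_perm(base, extra, ctrl);
    }

    /* With this change, little-endian code generation instead uses lvsr to
       produce ctrl and swaps the two vector inputs of the vperm. */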

The existing test/CodeGen/PowerPC/vec_misalign.ll is updated to test
the code generation for PPC64 and PPC64LE, in addition to the existing
PPC32/G5 testing.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@210493 91177308-0d34-0410-b5e6-96231b3b80d8
commit 8e38e86266 (parent f4a702c079)
Author: Bill Schmidt
Date:   2014-06-09 22:00:52 +00:00

2 changed files with 48 additions and 22 deletions

lib/Target/PowerPC/PPCISelLowering.cpp

@@ -8095,6 +8095,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// This is a type-legal unaligned Altivec load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
bool isLittleEndian = PPCSubTarget.isLittleEndian();
// This implements the loading of unaligned vectors as described in
// the venerable Apple Velocity Engine overview. Specifically:
@@ -8102,25 +8103,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
//
// The general idea is to expand a sequence of one or more unaligned
// loads into a alignment-based permutation-control instruction (lvsl),
// a series of regular vector loads (which always truncate their
// input address to an aligned address), and a series of permutations.
// The results of these permutations are the requested loaded values.
// The trick is that the last "extra" load is not taken from the address
// you might suspect (sizeof(vector) bytes after the last requested
// load), but rather sizeof(vector) - 1 bytes after the last
// requested vector. The point of this is to avoid a page fault if the
// base address happened to be aligned. This works because if the base
// address is aligned, then adding less than a full vector length will
// cause the last vector in the sequence to be (re)loaded. Otherwise,
// the next vector will be fetched as you might suspect was necessary.
// loads into an alignment-based permutation-control instruction (lvsl
// or lvsr), a series of regular vector loads (which always truncate
// their input address to an aligned address), and a series of
// permutations. The results of these permutations are the requested
// loaded values. The trick is that the last "extra" load is not taken
// from the address you might suspect (sizeof(vector) bytes after the
// last requested load), but rather sizeof(vector) - 1 bytes after the
// last requested vector. The point of this is to avoid a page fault if
// the base address happened to be aligned. This works because if the
// base address is aligned, then adding less than a full vector length
// will cause the last vector in the sequence to be (re)loaded.
// Otherwise, the next vector will be fetched as you might suspect was
// necessary.
// We might be able to reuse the permutation generation from
// a different base address offset from this one by an aligned amount.
// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
// optimization later.
SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
DAG, dl, MVT::v16i8);
Intrinsic::ID Intr = (isLittleEndian ?
Intrinsic::ppc_altivec_lvsr :
Intrinsic::ppc_altivec_lvsl);
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
// Refine the alignment of the original load (a "new" load created here
// which was identical to the first except for the alignment would be
@@ -8169,8 +8173,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (ExtraLoad.getValueType() != MVT::v4i32)
ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
// Because vperm has a big-endian bias, we must reverse the order
// of the input vectors and complement the permute control vector
// when generating little endian code. We have already handled the
// latter by using lvsr instead of lvsl, so just reverse BaseLoad
// and ExtraLoad here.
SDValue Perm;
if (isLittleEndian)
Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
ExtraLoad, BaseLoad, PermCntl, DAG, dl);
else
Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
if (VT != MVT::v4i32)
Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
@@ -8210,9 +8224,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
case ISD::INTRINSIC_WO_CHAIN:
if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() ==
Intrinsic::ppc_altivec_lvsl &&
case ISD::INTRINSIC_WO_CHAIN: {
bool isLittleEndian = PPCSubTarget.isLittleEndian();
Intrinsic::ID Intr = (isLittleEndian ?
Intrinsic::ppc_altivec_lvsr :
Intrinsic::ppc_altivec_lvsl);
if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
N->getOperand(1)->getOpcode() == ISD::ADD) {
SDValue Add = N->getOperand(1);
@@ -8224,8 +8241,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
UE = BasePtr->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
Intrinsic::ppc_altivec_lvsl) {
// We've found another LVSL, and this address if an aligned
Intr) {
// We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
@@ -8234,6 +8251,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
}
break;
case ISD::BSWAP:

test/CodeGen/PowerPC/vec_misalign.ll

@@ -1,4 +1,6 @@
; RUN: llc < %s -march=ppc32 -mcpu=g5
; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
target triple = "powerpc-apple-darwin8"
@@ -8,6 +10,8 @@ target triple = "powerpc-apple-darwin8"
define void @foo(i32 %x, ...) {
entry:
; CHECK: foo:
; CHECK-LE: foo:
%x_addr = alloca i32 ; <i32*> [#uses=1]
%ap = alloca i8* ; <i8**> [#uses=3]
%ap.0 = alloca i8* ; <i8**> [#uses=3]
@@ -27,6 +31,10 @@ entry:
%tmp8 = getelementptr %struct.u16qi* %tmp6, i32 0, i32 0 ; <<16 x i8>*> [#uses=1]
%tmp9 = getelementptr %struct.u16qi* %tmp7, i32 0, i32 0 ; <<16 x i8>*> [#uses=1]
%tmp10 = load <16 x i8>* %tmp9, align 4 ; <<16 x i8>> [#uses=1]
; CHECK: lvsl
; CHECK: vperm
; CHECK-LE: lvsr
; CHECK-LE: vperm
store <16 x i8> %tmp10, <16 x i8>* %tmp8, align 4
br label %return