diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index f3b0154103a..d5a685d9f48 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -5558,6 +5558,22 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } + // The remaining cases assume either big endian element order or + // a splat-size that equates to the element size of the vector + // to be built. An example that doesn't work for little endian is + // {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits + // and a vector element size of 16 bits. The code below will + // produce the vector in big endian element order, which for little + // endian is {-1, 0, -1, 0, -1, 0, -1, 0}. + + // For now, just avoid these optimizations in that case. + // FIXME: Develop correct optimizations for LE with mismatched + // splat and element sizes. + + if (PPCSubTarget.isLittleEndian() && + SplatSize != Op.getValueType().getVectorElementType().getSizeInBits()) + return SDValue(); + // Check to see if this is a wide variety of vsplti*, binop self cases. static const signed char SplatCsts[] = { -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, @@ -5821,21 +5837,36 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. Convert now. + + // For little endian, the order of the input vectors is reversed, and + // the permutation mask is complemented with respect to 31. This is + // necessary to produce proper semantics with the big-endian-biased vperm + // instruction. EVT EltVT = V1.getValueType().getVectorElementType(); unsigned BytesPerElement = EltVT.getSizeInBits()/8; + bool isLittleEndian = PPCSubTarget.isLittleEndian(); SmallVector ResultMask; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; for (unsigned j = 0; j != BytesPerElement; ++j) - ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, - MVT::i32)); + if (isLittleEndian) + ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j), + MVT::i32)); + else + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, + MVT::i32)); } SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, ResultMask); - return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); + if (isLittleEndian) + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V2, V1, VPermMask); + else + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V1, V2, VPermMask); } /// getAltivecCompareInfo - Given an intrinsic, return false if it is not an diff --git a/test/CodeGen/PowerPC/vperm-lowering.ll b/test/CodeGen/PowerPC/vperm-lowering.ll new file mode 100644 index 00000000000..d55d26c959b --- /dev/null +++ b/test/CodeGen/PowerPC/vperm-lowering.ll @@ -0,0 +1,66 @@ +; RUN: llc -O0 -fast-isel=false -mcpu=ppc64 < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +define <16 x i8> @foo() nounwind ssp { + %1 = shufflevector <16 x i8> , <16 x i8> , <16 x i32> + ret <16 x i8> %1 +} + +; CHECK: .LCPI0_0: +; CHECK: .byte 31 +; CHECK: .byte 26 +; CHECK: .byte 21 +; CHECK: .byte 16 +; CHECK: .byte 11 +; CHECK: .byte 6 +; CHECK: .byte 1 +; CHECK: .byte 28 +; CHECK: .byte 23 +; CHECK: .byte 18 +; CHECK: .byte 13 +; CHECK: .byte 8 +; CHECK: .byte 3 +; CHECK: .byte 30 +; CHECK: .byte 25 +; CHECK: .byte 20 +; CHECK: .LCPI0_1: +; CHECK: .byte 0 +; CHECK: .byte 1 +; CHECK: .byte 2 +; CHECK: .byte 3 +; CHECK: .byte 4 +; CHECK: .byte 5 +; CHECK: .byte 6 +; CHECK: .byte 7 +; CHECK: .byte 8 +; CHECK: .byte 9 +; CHECK: .byte 10 +; CHECK: .byte 11 +; CHECK: .byte 12 +; CHECK: .byte 13 +; CHECK: .byte 14 +; CHECK: .byte 15 +; CHECK: .LCPI0_2: +; CHECK: .byte 16 +; CHECK: .byte 17 +; CHECK: .byte 18 +; CHECK: .byte 19 +; CHECK: .byte 20 +; CHECK: .byte 21 +; CHECK: .byte 22 +; CHECK: .byte 23 +; CHECK: .byte 24 +; CHECK: .byte 25 +; CHECK: .byte 26 +; CHECK: .byte 27 +; CHECK: .byte 28 +; CHECK: .byte 29 +; CHECK: .byte 30 +; CHECK: .byte 31 +; CHECK: foo: +; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_2@toc@ha +; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_2@toc@l +; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]] +; CHECK: vperm {{[0-9]+}}, [[REG3]], {{[0-9]+}}, {{[0-9]+}}