diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6bcbc8e0ff4..898cd29c914 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -63,6 +63,8 @@ class VectorLegalizer {
   SDValue ExpandUINT_TO_FLOAT(SDValue Op);
   // Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
   SDValue ExpandSEXTINREG(SDValue Op);
+  // Expand bswap of vectors into a shuffle if legal.
+  SDValue ExpandBSWAP(SDValue Op);
   // Implement vselect in terms of XOR, AND, OR when blend is not supported
   // by the target.
   SDValue ExpandVSELECT(SDValue Op);
@@ -297,6 +299,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case TargetLowering::Expand:
     if (Node->getOpcode() == ISD::SIGN_EXTEND_INREG)
       Result = ExpandSEXTINREG(Op);
+    else if (Node->getOpcode() == ISD::BSWAP)
+      Result = ExpandBSWAP(Op);
     else if (Node->getOpcode() == ISD::VSELECT)
       Result = ExpandVSELECT(Op);
     else if (Node->getOpcode() == ISD::SELECT)
@@ -682,6 +686,29 @@ SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
   return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz);
 }
 
+SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
+  EVT VT = Op.getValueType();
+
+  // Generate a byte wise shuffle mask for the BSWAP.
+  SmallVector<int, 16> ShuffleMask;
+  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+  for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
+    for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
+      ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
+
+  EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size());
+
+  // Only emit a shuffle if the mask is legal.
+  if (!TLI.isShuffleMaskLegal(ShuffleMask, ByteVT))
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  SDLoc DL(Op);
+  Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
+  Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
+                            ShuffleMask.data());
+  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
 SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
   // Implement VSELECT in terms of XOR, AND, OR
   // on platforms which do not support blend natively.
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac47f3e9eb1..d02a03ccb2a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -520,6 +520,8 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    setOperationAction(ISD::BSWAP, VT, Expand);
   }
 
   // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply.
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 976cef39a21..5beb752d3a4 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -414,6 +414,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
   }
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
index e6790cfdfd5..538360cf39d 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -450,6 +450,8 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
     setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
 
+    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+
     for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
          InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
       setTruncStoreAction((MVT::SimpleValueType)VT,
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index a0b3e467d20..04bd43547c7 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -460,6 +460,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
       setOperationAction(ISD::SDIVREM, VT, Expand);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
       setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::BSWAP, VT, Expand);
       setOperationAction(ISD::CTPOP, VT, Expand);
       setOperationAction(ISD::CTLZ, VT, Expand);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 63d28034ca8..e1db618baf1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15116,7 +15116,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   if (VT.getSizeInBits() == 64)
     return false;
 
-  // FIXME: pshufb, blends, shifts.
+  // If this is a single-input shuffle with no 128 bit lane crossings we can
+  // lower it into pshufb.
+  if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
+      (SVT.is256BitVector() && Subtarget->hasInt256())) {
+    bool isLegal = true;
+    for (unsigned I = 0, E = M.size(); I != E; ++I) {
+      if (M[I] >= (int)SVT.getVectorNumElements() ||
+          ShuffleCrosses128bitLane(SVT, I, M[I])) {
+        isLegal = false;
+        break;
+      }
+    }
+    if (isLegal)
+      return true;
+  }
+
+  // FIXME: blends, shifts.
   return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, SVT) ||
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
index eb76ba6ea08..7215ad615e8 100644
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -178,3 +178,11 @@ entry:
   ret void
 }
 
+define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
+; CHECK-LABEL: test_vrev32_bswap:
+; CHECK: vrev32.8
+  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
+  ret <4 x i32> %bswap
+}
+
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/rev.ll b/test/CodeGen/ARM64/rev.ll
index 0006ea33175..1da59e42f6b 100644
--- a/test/CodeGen/ARM64/rev.ll
+++ b/test/CodeGen/ARM64/rev.ll
@@ -222,3 +222,14 @@ entry:
   ret void
 }
 
+
+define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
+; CHECK-LABEL: test_vrev32_bswap:
+; CHECK: rev32.16b
+; CHECK-NOT: rev
+; CHECK: ret
+  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
+  ret <4 x i32> %bswap
+}
+
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 6b77176cb69..a18fd68865c 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,19 +1,127 @@
-; RUN: llc < %s -mcpu=x86_64 | FileCheck %s
+; RUN: llc < %s -mcpu=x86-64 | FileCheck %s -check-prefix=CHECK-NOSSSE3
+; RUN: llc < %s -mcpu=core2 | FileCheck %s -check-prefix=CHECK-SSSE3
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
 
-define <2 x i64> @foo(<2 x i64> %v) #0 {
+define <8 x i16> @test1(<8 x i16> %v) #0 {
+entry:
+  %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
+  ret <8 x i16> %r
+
+; CHECK-NOSSSE3-LABEL: @test1
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test1
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test1
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i32> @test2(<4 x i32> %v) #0 {
+entry:
+  %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
+  ret <4 x i32> %r
+
+; CHECK-NOSSSE3-LABEL: @test2
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test2
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test2
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <2 x i64> @test3(<2 x i64> %v) #0 {
 entry:
   %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
   ret <2 x i64> %r
+
+; CHECK-NOSSSE3-LABEL: @test3
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test3
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test3
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
+
+define <16 x i16> @test4(<16 x i16> %v) #0 {
+entry:
+  %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
+  ret <16 x i16> %r
+
+; CHECK-SSSE3-LABEL: @test4
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test4
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <8 x i32> @test5(<8 x i32> %v) #0 {
+entry:
+  %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
+  ret <8 x i32> %r
+
+; CHECK-SSSE3-LABEL: @test5
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test5
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i64> @test6(<4 x i64> %v) #0 {
+entry:
+  %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
+  ret <4 x i64> %r
+
+; CHECK-SSSE3-LABEL: @test6
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test6
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
 }
-; CHECK-LABEL: @foo
-; CHECK: bswapq
-; CHECK: bswapq
-; CHECK: retq
 
 attributes #0 = { nounwind uwtable }
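
For reference, the byte-wise shuffle that the new ExpandBSWAP path builds for a <4 x i32> bswap corresponds to the hand-written IR sketch below. This is an illustration only and not part of the patch: the legalizer operates on SelectionDAG nodes, and on targets where this <16 x i8> mask is legal (e.g. NEON vrev32.8, AArch64 rev32.16b, or x86 pshufb) the shuffle is matched directly; otherwise the bswap is unrolled per element as before.

  ; bswap <4 x i32> %v: reverse the four bytes inside each 32-bit element.
  %bytes   = bitcast <4 x i32> %v to <16 x i8>
  %swapped = shufflevector <16 x i8> %bytes, <16 x i8> undef,
               <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4,
                           i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  %r       = bitcast <16 x i8> %swapped to <4 x i32>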