From 68a4b416482ea92fe9aa4f1adf86c6e0574a4186 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 21 Jul 2015 21:40:17 +0000 Subject: [PATCH] [PPC64LE] More vector swap optimization TLC This makes one substantive change and a few stylistic changes to the VSX swap optimization pass. The substantive change is to permit LXSDX and LXSSPX instructions to participate in swap optimization computations. The previous change to insert a swap following a SUBREG_TO_REG widening operation makes this almost trivial. I experimented with also permitting STXSDX and STXSSPX instructions. This can be done using similar techniques: we could insert a swap prior to a narrowing COPY operation, and then permit these stores to participate. I prototyped this, but discovered that the pattern of a narrowing COPY followed by an STXSDX does not occur in any of our test-suite code. So instead, I added commentary indicating that this could be done. Other TLC: - I changed SH_COPYSCALAR to SH_COPYWIDEN to more clearly indicate the direction of the copy. - I factored the insertion of swap instructions into a separate function. Finally, I added a new test case to check that the scalar-to-vector loads are working properly with swap optimization. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242838 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 68 ++++++++++++++++-------- test/CodeGen/PowerPC/swaps-le-6.ll | 44 +++++++++++++++ 2 files changed, 91 insertions(+), 21 deletions(-) create mode 100644 test/CodeGen/PowerPC/swaps-le-6.ll diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index 3fb1dcc3d4a..d9504710b3f 100644 --- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -94,7 +94,7 @@ enum SHValues { SH_NOSWAP_ST, SH_SPLAT, SH_XXPERMDI, - SH_COPYSCALAR + SH_COPYWIDEN }; struct PPCVSXSwapRemoval : public MachineFunctionPass { @@ -149,6 +149,11 @@ private: // handling. Return true iff any changes are made. bool removeSwaps(); + // Insert a swap instruction from SrcReg to DstReg at the given + // InsertPoint. + void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg); + // Update instructions requiring special handling. void handleSpecialSwappables(int EntryIdx); @@ -340,6 +345,15 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { SwapVector[VecIdx].IsLoad = 1; SwapVector[VecIdx].IsSwap = 1; break; + case PPC::LXSDX: + case PPC::LXSSPX: + // A load of a floating-point value into the high-order half of + // a vector register is safe, provided that we introduce a swap + // following the load, which will be done by the SUBREG_TO_REG + // support. So just mark these as safe. + SwapVector[VecIdx].IsLoad = 1; + SwapVector[VecIdx].IsSwappable = 1; + break; case PPC::STVX: // Non-permuting stores are currently unsafe. We can use special // handling for this in the future. By not marking these as @@ -382,7 +396,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { else if (isVecReg(MI.getOperand(0).getReg()) && isScalarVecReg(MI.getOperand(2).getReg())) { SwapVector[VecIdx].IsSwappable = 1; - SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN; } break; } @@ -417,7 +431,14 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { case PPC::STVEHX: case PPC::STVEWX: case PPC::STVXL: + // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX, + // by adding special handling for narrowing copies as well as + // widening ones. However, I've experimented with this, and in + // practice we currently do not appear to use STXSDX fed by + // a narrowing copy from a full vector register. Since I can't + // generate any useful test cases, I've left this alone for now. case PPC::STXSDX: + case PPC::STXSSPX: case PPC::VCIPHER: case PPC::VCIPHERLAST: case PPC::VMRGHB: @@ -540,7 +561,8 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, } if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { - SwapVector[VecIdx].MentionsPhysVR = 1; + if (!isScalarVecReg(CopySrcReg)) + SwapVector[VecIdx].MentionsPhysVR = 1; return CopySrcReg; } @@ -626,8 +648,8 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { SwapVector[Repr].WebRejected = 1; DEBUG(dbgs() << - format("Web %d rejected for physreg, partial reg, or not swap[pable]\n", - Repr)); + format("Web %d rejected for physreg, partial reg, or not " + "swap[pable]\n", Repr)); DEBUG(dbgs() << " in " << EntryIdx << ": "); DEBUG(SwapVector[EntryIdx].VSEMI->dump()); DEBUG(dbgs() << "\n"); @@ -740,6 +762,21 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { } } +// Create an xxswapd instruction and insert it prior to the given point. +// MI is used to determine basic block and debug loc information. +// FIXME: When inserting a swap, we should check whether SrcReg is +// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so, +// then instead we should generate a copy from Reg to DstReg. +void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI, + MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg) { + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), DstReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addImm(2); +} + // The identified swap entry requires special handling to allow its // containing computation to be optimized. Perform that handling // here. @@ -808,7 +845,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { // For a copy from a scalar floating-point register to a vector // register, removing swaps will leave the copied value in the // wrong lane. Insert a swap following the copy to fix this. - case SHValues::SH_COPYSCALAR: { + case SHValues::SH_COPYWIDEN: { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); @@ -829,7 +866,6 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { // assignment problem. In this case we must copy from VRRC to VSRC // prior to the swap, and from VSRC to VRRC following the swap. // Coalescing will usually remove all this mess. - if (DstRC == &PPC::VRRCRegClass) { unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); @@ -839,11 +875,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { .addReg(NewVReg); DEBUG(MI->getNextNode()->dump()); - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), VSRCTmp2) - .addReg(VSRCTmp1) - .addReg(VSRCTmp1) - .addImm(2); + insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1); DEBUG(MI->getNextNode()->getNextNode()->dump()); BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), @@ -852,13 +884,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump()); } else { - - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), DstReg) - .addReg(NewVReg) - .addReg(NewVReg) - .addImm(2); - + insertSwap(MI, InsertPoint, DstReg, NewVReg); DEBUG(MI->getNextNode()->dump()); } break; @@ -944,8 +970,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() { case SH_XXPERMDI: DEBUG(dbgs() << "special:xxpermdi "); break; - case SH_COPYSCALAR: - DEBUG(dbgs() << "special:copyscalar "); + case SH_COPYWIDEN: + DEBUG(dbgs() << "special:copywiden "); break; } } diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll new file mode 100644 index 00000000000..365aeee2d8f --- /dev/null +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -0,0 +1,44 @@ +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -O3 < %s | FileCheck %s + +; These tests verify that VSX swap optimization works when loading a scalar +; into a vector register. + + +@x = global <2 x double> , align 16 +@z = global <2 x double> , align 16 +@y = global double 1.780000e+00, align 8 + +define void @bar0() { +entry: + %0 = load <2 x double>, <2 x double>* @x, align 16 + %1 = load double, double* @y, align 8 + %vecins = insertelement <2 x double> %0, double %1, i32 0 + store <2 x double> %vecins, <2 x double>* @z, align 16 + ret void +} + +; CHECK-LABEL: @bar0 +; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] +; CHECK-DAG: lxsdx [[REG2:[0-9]+]] +; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]] +; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1 +; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1 +; CHECK: stxvd2x [[REG5]] + +define void @bar1() { +entry: + %0 = load <2 x double>, <2 x double>* @x, align 16 + %1 = load double, double* @y, align 8 + %vecins = insertelement <2 x double> %0, double %1, i32 1 + store <2 x double> %vecins, <2 x double>* @z, align 16 + ret void +} + +; CHECK-LABEL: @bar1 +; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] +; CHECK-DAG: lxsdx [[REG2:[0-9]+]] +; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]] +; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1 +; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]] +; CHECK: stxvd2x [[REG5]] +