[PPC64LE] More vector swap optimization TLC

This makes one substantive change and a few stylistic changes to the
VSX swap optimization pass.

The substantive change is to permit LXSDX and LXSSPX instructions to
participate in swap optimization computations.  The previous change to
insert a swap following a SUBREG_TO_REG widening operation makes this
almost trivial.

I experimented with also permitting STXSDX and STXSSPX instructions.
This can be done using similar techniques:  we could insert a swap
prior to a narrowing COPY operation, and then permit these stores to
participate.  I prototyped this, but discovered that the pattern of a
narrowing COPY followed by an STXSDX does not occur in any of our
test-suite code.  So instead, I added commentary indicating that this
could be done.
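
For reference, here is a rough, untested sketch of the shape that handling
could take.  It is not part of this patch; SH_COPYNARROW is an invented
name, and the exact operand positions of the narrowing COPY are assumptions:

  // Hypothetical sketch, not compiled: in gatherVectorInstructions(),
  // the scalar stores would be marked much like LXSDX/LXSSPX, so they
  // no longer disqualify a web:
  case PPC::STXSDX:
  case PPC::STXSSPX:
    SwapVector[VecIdx].IsStore = 1;
    SwapVector[VecIdx].IsSwappable = 1;
    break;

  // The narrowing COPY feeding such a store would be flagged with a new
  // special-handling kind (SH_COPYNARROW), and handleSpecialSwappables()
  // would insert a swap ahead of it using the new insertSwap() helper:
  case SHValues::SH_COPYNARROW: {
    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;   // the narrowing COPY
    unsigned SrcReg = MI->getOperand(1).getReg();    // full vector value (assumed operand 1)
    unsigned NewVReg = MRI->createVirtualRegister(&PPC::VSRCRegClass);
    MachineBasicBlock::iterator InsertPoint = MI;    // insert prior to the COPY
    insertSwap(MI, InsertPoint, NewVReg, SrcReg);
    MI->getOperand(1).setReg(NewVReg);               // COPY now reads the swapped value
    break;
  }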

Other TLC:
 - I changed SH_COPYSCALAR to SH_COPYWIDEN to more clearly indicate
 the direction of the copy.
 - I factored the insertion of swap instructions into a separate
 function.

Finally, I added a new test case to check that the scalar-to-vector
loads are working properly with swap optimization.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242838 91177308-0d34-0410-b5e6-96231b3b80d8
Bill Schmidt 2015-07-21 21:40:17 +00:00
parent 5938be0e84
commit 68a4b41648
2 changed files with 91 additions and 21 deletions


@@ -94,7 +94,7 @@ enum SHValues {
SH_NOSWAP_ST,
SH_SPLAT,
SH_XXPERMDI,
- SH_COPYSCALAR
+ SH_COPYWIDEN
};
struct PPCVSXSwapRemoval : public MachineFunctionPass {
@@ -149,6 +149,11 @@ private:
// handling. Return true iff any changes are made.
bool removeSwaps();
// Insert a swap instruction from SrcReg to DstReg at the given
// InsertPoint.
void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
unsigned DstReg, unsigned SrcReg);
// Update instructions requiring special handling.
void handleSpecialSwappables(int EntryIdx);
@@ -340,6 +345,15 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
SwapVector[VecIdx].IsLoad = 1;
SwapVector[VecIdx].IsSwap = 1;
break;
case PPC::LXSDX:
case PPC::LXSSPX:
// A load of a floating-point value into the high-order half of
// a vector register is safe, provided that we introduce a swap
// following the load, which will be done by the SUBREG_TO_REG
// support. So just mark these as safe.
SwapVector[VecIdx].IsLoad = 1;
SwapVector[VecIdx].IsSwappable = 1;
break;
case PPC::STVX:
// Non-permuting stores are currently unsafe. We can use special
// handling for this in the future. By not marking these as
@@ -382,7 +396,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
else if (isVecReg(MI.getOperand(0).getReg()) &&
isScalarVecReg(MI.getOperand(2).getReg())) {
SwapVector[VecIdx].IsSwappable = 1;
- SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
}
break;
}
@@ -417,7 +431,14 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
case PPC::STVEHX:
case PPC::STVEWX:
case PPC::STVXL:
// We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
// by adding special handling for narrowing copies as well as
// widening ones. However, I've experimented with this, and in
// practice we currently do not appear to use STXSDX fed by
// a narrowing copy from a full vector register. Since I can't
// generate any useful test cases, I've left this alone for now.
case PPC::STXSDX:
case PPC::STXSSPX:
case PPC::VCIPHER:
case PPC::VCIPHERLAST:
case PPC::VMRGHB:
@@ -540,7 +561,8 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
}
if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) {
- SwapVector[VecIdx].MentionsPhysVR = 1;
+ if (!isScalarVecReg(CopySrcReg))
+   SwapVector[VecIdx].MentionsPhysVR = 1;
return CopySrcReg;
}
@@ -626,8 +648,8 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
DEBUG(dbgs() <<
format("Web %d rejected for physreg, partial reg, or not swap[pable]\n",
Repr));
format("Web %d rejected for physreg, partial reg, or not "
"swap[pable]\n", Repr));
DEBUG(dbgs() << " in " << EntryIdx << ": ");
DEBUG(SwapVector[EntryIdx].VSEMI->dump());
DEBUG(dbgs() << "\n");
@@ -740,6 +762,21 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
}
}
// Create an xxswapd instruction and insert it prior to the given point.
// MI is used to determine basic block and debug loc information.
// FIXME: When inserting a swap, we should check whether SrcReg is
// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
// then instead we should generate a copy from Reg to DstReg.
void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
MachineBasicBlock::iterator InsertPoint,
unsigned DstReg, unsigned SrcReg) {
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::XXPERMDI), DstReg)
.addReg(SrcReg)
.addReg(SrcReg)
.addImm(2);
}
// The identified swap entry requires special handling to allow its
// containing computation to be optimized. Perform that handling
// here.
@@ -808,7 +845,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
// For a copy from a scalar floating-point register to a vector
// register, removing swaps will leave the copied value in the
// wrong lane. Insert a swap following the copy to fix this.
- case SHValues::SH_COPYSCALAR: {
+ case SHValues::SH_COPYWIDEN: {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
@@ -829,7 +866,6 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
// assignment problem. In this case we must copy from VRRC to VSRC
// prior to the swap, and from VSRC to VRRC following the swap.
// Coalescing will usually remove all this mess.
if (DstRC == &PPC::VRRCRegClass) {
unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
@@ -839,11 +875,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
.addReg(NewVReg);
DEBUG(MI->getNextNode()->dump());
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::XXPERMDI), VSRCTmp2)
.addReg(VSRCTmp1)
.addReg(VSRCTmp1)
.addImm(2);
insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
DEBUG(MI->getNextNode()->getNextNode()->dump());
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
@@ -852,13 +884,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump());
} else {
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::XXPERMDI), DstReg)
.addReg(NewVReg)
.addReg(NewVReg)
.addImm(2);
insertSwap(MI, InsertPoint, DstReg, NewVReg);
DEBUG(MI->getNextNode()->dump());
}
break;
@@ -944,8 +970,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
case SH_XXPERMDI:
DEBUG(dbgs() << "special:xxpermdi ");
break;
- case SH_COPYSCALAR:
-   DEBUG(dbgs() << "special:copyscalar ");
+ case SH_COPYWIDEN:
+   DEBUG(dbgs() << "special:copywiden ");
break;
}
}


@@ -0,0 +1,44 @@
; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -O3 < %s | FileCheck %s
; These tests verify that VSX swap optimization works when loading a scalar
; into a vector register.
@x = global <2 x double> <double 9.970000e+01, double -1.032220e+02>, align 16
@z = global <2 x double> <double 2.332000e+01, double 3.111111e+01>, align 16
@y = global double 1.780000e+00, align 8
define void @bar0() {
entry:
%0 = load <2 x double>, <2 x double>* @x, align 16
%1 = load double, double* @y, align 8
%vecins = insertelement <2 x double> %0, double %1, i32 0
store <2 x double> %vecins, <2 x double>* @z, align 16
ret void
}
; CHECK-LABEL: @bar0
; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1
; CHECK: stxvd2x [[REG5]]
define void @bar1() {
entry:
%0 = load <2 x double>, <2 x double>* @x, align 16
%1 = load double, double* @y, align 8
%vecins = insertelement <2 x double> %0, double %1, i32 1
store <2 x double> %vecins, <2 x double>* @z, align 16
ret void
}
; CHECK-LABEL: @bar1
; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]]
; CHECK: stxvd2x [[REG5]]