diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1d39e89f17b..47b12ee6a9e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -390,15 +390,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
   }
 
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i8   , Expand);
   if (Subtarget->hasLZCNT()) {
     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
   } else {
     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
-    if (Subtarget->is64Bit())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
+    if (Subtarget->is64Bit()) {
       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
   }
 
   if (Subtarget->hasPOPCNT()) {
@@ -9834,6 +9843,32 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }
 
+SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse).
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // And xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
 SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   EVT OpVT = VT;
@@ -10686,6 +10721,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_:       return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:              return LowerCTLZ(Op, DAG);
+  case ISD::CTLZ_ZERO_UNDEF:   return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   case ISD::CTTZ:              return LowerCTTZ(Op, DAG);
   case ISD::MUL:               return LowerMUL(Op, DAG);
   case ISD::SRA:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 52908b19bf2..7bfcc45948d 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -823,6 +823,7 @@ namespace llvm {
     SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index f593092df76..a5c05de0199 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1761,12 +1761,3 @@ def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
 def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
 def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
 def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
-def : Pat<(ctlz_zero_undef GR16:$src), (XOR16ri (BSR16rr GR16:$src), 15)>;
-def : Pat<(ctlz_zero_undef GR32:$src), (XOR32ri (BSR32rr GR32:$src), 31)>;
-def : Pat<(ctlz_zero_undef GR64:$src), (XOR64ri8 (BSR64rr GR64:$src), 63)>;
-def : Pat<(ctlz_zero_undef (loadi16 addr:$src)),
-          (XOR16ri (BSR16rm addr:$src), 15)>;
-def : Pat<(ctlz_zero_undef (loadi32 addr:$src)),
-          (XOR32ri (BSR32rm addr:$src), 31)>;
-def : Pat<(ctlz_zero_undef (loadi64 addr:$src)),
-          (XOR64ri8 (BSR64rm addr:$src), 63)>;
diff --git a/test/CodeGen/X86/clz.ll b/test/CodeGen/X86/clz.ll
index 4e080309b6f..ad47bde8549 100644
--- a/test/CodeGen/X86/clz.ll
+++ b/test/CodeGen/X86/clz.ll
@@ -31,7 +31,7 @@ entry:
 ; CHECK: t3:
 ; CHECK: bsrw
 ; CHECK-NOT: cmov
-; CHECK: xorw $15,
+; CHECK: xorl $15,
 ; CHECK: ret
 }
 
@@ -63,3 +63,28 @@ entry:
   %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %or, i1 false)
   ret i32 %tmp1
 }
+
+define i32 @t6(i32 %n) nounwind {
+entry:
+; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
+; the most significant bit, which is what 'bsr' does natively.
+; CHECK: t6:
+; CHECK: bsrl
+; CHECK-NOT: xorl
+; CHECK: ret
+  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %n, i1 true)
+  %bsr = xor i32 %ctlz, 31
+  ret i32 %bsr
+}
+
+define i32 @t7(i32 %n) nounwind {
+entry:
+; Same as t6, but ensure this happens even when there is a potential zero.
+; CHECK: t7:
+; CHECK: bsrl
+; CHECK-NOT: xorl
+; CHECK: ret
+  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %n, i1 false)
+  %bsr = xor i32 %ctlz, 31
+  ret i32 %bsr
+}
diff --git a/test/CodeGen/X86/lzcnt.ll b/test/CodeGen/X86/lzcnt.ll
index c2b3e68cbd2..eb010d7f5a8 100644
--- a/test/CodeGen/X86/lzcnt.ll
+++ b/test/CodeGen/X86/lzcnt.ll
@@ -32,3 +32,31 @@ define i64 @t4(i64 %x) nounwind {
 ; CHECK: t4:
 ; CHECK: lzcntq
 }
+
+define i8 @t5(i8 %x) nounwind {
+  %tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 true )
+  ret i8 %tmp
+; CHECK: t5:
+; CHECK: lzcntw
+}
+
+define i16 @t6(i16 %x) nounwind {
+  %tmp = tail call i16 @llvm.ctlz.i16( i16 %x, i1 true )
+  ret i16 %tmp
+; CHECK: t6:
+; CHECK: lzcntw
+}
+
+define i32 @t7(i32 %x) nounwind {
+  %tmp = tail call i32 @llvm.ctlz.i32( i32 %x, i1 true )
+  ret i32 %tmp
+; CHECK: t7:
+; CHECK: lzcntl
+}
+
+define i64 @t8(i64 %x) nounwind {
+  %tmp = tail call i64 @llvm.ctlz.i64( i64 %x, i1 true )
+  ret i64 %tmp
+; CHECK: t8:
+; CHECK: lzcntq
+}
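
For reference, the arithmetic the new LowerCTLZ_ZERO_UNDEF relies on: for a nonzero
value, bsr returns the bit index of the most significant set bit, and the leading-zero
count equals (NumBits - 1) minus that index; since the index never exceeds NumBits - 1,
the subtraction is the same as an xor with NumBits - 1. That is also why t6/t7 expect
no xorl: 'ctlz(x, i1 true) xor 31' is exactly the bsr result, so the backend can fold
the two xors away. The standalone C++ below is only a sketch of that identity, assuming
a GCC/Clang-style __builtin_clz; the helper names msb_index and clz_via_bsr are made up
for the illustration and do not appear in the patch.

  // Illustration only: the identity behind the bsr + xor lowering (i32 case).
  #include <cassert>
  #include <cstdint>

  // What x86 'bsr' computes for a nonzero input: the index of the most
  // significant set bit.  __builtin_clz is undefined for 0, matching the
  // ctlz_zero_undef semantics.
  static unsigned msb_index(uint32_t x) {
    return 31u - static_cast<unsigned>(__builtin_clz(x));
  }

  // ctlz_zero_undef lowered the way the patch does it: bsr, then xor with
  // NumBits-1 (31 for i32).  Valid because msb_index(x) <= 31.
  static unsigned clz_via_bsr(uint32_t x) {
    return msb_index(x) ^ 31u;
  }

  int main() {
    const uint32_t vals[] = {1u, 0x80u, 0xdeadbeefu, 0x80000000u};
    for (uint32_t x : vals) {
      assert(clz_via_bsr(x) == static_cast<unsigned>(__builtin_clz(x)));
      // The pattern t6/t7 exercise: ctlz(x, true) ^ 31 is just the msb index,
      // i.e. a lone bsr with no xor needed.
      assert((static_cast<unsigned>(__builtin_clz(x)) ^ 31u) == msb_index(x));
    }
    return 0;
  }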