From acc068e873a1a2afa1edef20452722d97eec8f71 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Sat, 24 Dec 2011 10:55:54 +0000
Subject: [PATCH] Switch the lowering of CTLZ_ZERO_UNDEF from a .td pattern
 back to the X86ISelLowering C++ code.

Because this is lowered via an xor wrapped around a bsr, we want the
dagcombine which runs after isel lowering to have a chance to clean things
up. In particular, it is very common to see code which looks like:

  (sizeof(x)*8 - 1) ^ __builtin_clz(x)

which is trying to compute the most significant bit of 'x'. That's actually
the value computed directly by the 'bsr' instruction, but if we match it too
late, we'll get completely redundant xor instructions. (A standalone sketch
of this idiom follows the patch.) The more naive code for the above
(subtracting rather than using an xor) still isn't handled correctly due to
the dagcombine getting confused.

Also, while here, fix an issue spotted by inspection: we should have been
expanding the zero-undef variants to the normal variants when there is an
'lzcnt' instruction. Do so, and test for this. We don't want to generate
unnecessary 'bsr' instructions.

These two changes fix some regressions in encoding and decoding benchmarks.
However, there is still a *lot* to improve on in this type of code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@147244 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 40 ++++++++++++++++++++++++++++--
 lib/Target/X86/X86ISelLowering.h   |  1 +
 lib/Target/X86/X86InstrCompiler.td |  9 -------
 test/CodeGen/X86/clz.ll            | 27 +++++++++++++++++++-
 test/CodeGen/X86/lzcnt.ll          | 28 +++++++++++++++++++++
 5 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1d39e89f17b..47b12ee6a9e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -390,15 +390,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
   }

-  setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i8 , Expand);
   if (Subtarget->hasLZCNT()) {
     setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
   } else {
     setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
     setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
     setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
-    if (Subtarget->is64Bit())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
+    if (Subtarget->is64Bit()) {
       setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
   }

   if (Subtarget->hasPOPCNT()) {
@@ -9834,6 +9843,32 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }

+SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse).
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // And xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
 SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   EVT OpVT = VT;
@@ -10686,6 +10721,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ: return LowerCTLZ(Op, DAG);
+  case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   case ISD::CTTZ: return LowerCTTZ(Op, DAG);
   case ISD::MUL: return LowerMUL(Op, DAG);
   case ISD::SRA:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 52908b19bf2..7bfcc45948d 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -823,6 +823,7 @@ namespace llvm {
     SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index f593092df76..a5c05de0199 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1761,12 +1761,3 @@ def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
 def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
 def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
 def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
-def : Pat<(ctlz_zero_undef GR16:$src), (XOR16ri (BSR16rr GR16:$src), 15)>;
-def : Pat<(ctlz_zero_undef GR32:$src), (XOR32ri (BSR32rr GR32:$src), 31)>;
-def : Pat<(ctlz_zero_undef GR64:$src), (XOR64ri8 (BSR64rr GR64:$src), 63)>;
-def : Pat<(ctlz_zero_undef (loadi16 addr:$src)),
-          (XOR16ri (BSR16rm addr:$src), 15)>;
-def : Pat<(ctlz_zero_undef (loadi32 addr:$src)),
-          (XOR32ri (BSR32rm addr:$src), 31)>;
-def : Pat<(ctlz_zero_undef (loadi64 addr:$src)),
-          (XOR64ri8 (BSR64rm addr:$src), 63)>;
diff --git a/test/CodeGen/X86/clz.ll b/test/CodeGen/X86/clz.ll
index 4e080309b6f..ad47bde8549 100644
--- a/test/CodeGen/X86/clz.ll
+++ b/test/CodeGen/X86/clz.ll
@@ -31,7 +31,7 @@ entry:
 ; CHECK: t3:
 ; CHECK: bsrw
 ; CHECK-NOT: cmov
-; CHECK: xorw $15,
+; CHECK: xorl $15,
 ; CHECK: ret
 }

@@ -63,3 +63,28 @@ entry:
   %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %or, i1 false)
   ret i32 %tmp1
 }
+
+define i32 @t6(i32 %n) nounwind {
+entry:
+; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
+; the most significant bit, which is what 'bsr' does natively.
+; CHECK: t6:
+; CHECK: bsrl
+; CHECK-NOT: xorl
+; CHECK: ret
+  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %n, i1 true)
+  %bsr = xor i32 %ctlz, 31
+  ret i32 %bsr
+}
+
+define i32 @t7(i32 %n) nounwind {
+entry:
+; Same as t6, but ensure this happens even when there is a potential zero.
+; CHECK: t7:
+; CHECK: bsrl
+; CHECK-NOT: xorl
+; CHECK: ret
+  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %n, i1 false)
+  %bsr = xor i32 %ctlz, 31
+  ret i32 %bsr
+}
diff --git a/test/CodeGen/X86/lzcnt.ll b/test/CodeGen/X86/lzcnt.ll
index c2b3e68cbd2..eb010d7f5a8 100644
--- a/test/CodeGen/X86/lzcnt.ll
+++ b/test/CodeGen/X86/lzcnt.ll
@@ -32,3 +32,31 @@ define i64 @t4(i64 %x) nounwind {
 ; CHECK: t4:
 ; CHECK: lzcntq
 }
+
+define i8 @t5(i8 %x) nounwind {
+  %tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 true )
+  ret i8 %tmp
+; CHECK: t5:
+; CHECK: lzcntw
+}
+
+define i16 @t6(i16 %x) nounwind {
+  %tmp = tail call i16 @llvm.ctlz.i16( i16 %x, i1 true )
+  ret i16 %tmp
+; CHECK: t6:
+; CHECK: lzcntw
+}
+
+define i32 @t7(i32 %x) nounwind {
+  %tmp = tail call i32 @llvm.ctlz.i32( i32 %x, i1 true )
+  ret i32 %tmp
+; CHECK: t7:
+; CHECK: lzcntl
+}
+
+define i64 @t8(i64 %x) nounwind {
+  %tmp = tail call i64 @llvm.ctlz.i64( i64 %x, i1 true )
+  ret i64 %tmp
+; CHECK: t8:
+; CHECK: lzcntq
+}
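
A standalone sketch of the idiom described in the commit message above (not
part of the patch; it assumes a GCC/Clang-style compiler for __builtin_clz,
and the helper name msb_index is made up for this illustration):

#include <cassert>
#include <cstdio>

// For a nonzero 32-bit x, __builtin_clz(x) lies in [0, 31]. Since 31 is all
// ones in those five bits, xor'ing with 31 is the same as subtracting from
// 31 (no borrows can occur), which yields the index of the most significant
// set bit -- exactly the value the x86 'bsr' instruction computes directly.
static unsigned msb_index(unsigned x) {
  assert(x != 0 && "__builtin_clz (like bsr) is undefined for a zero input");
  return (sizeof(x) * 8 - 1) ^ static_cast<unsigned>(__builtin_clz(x));
}

int main() {
  std::printf("%u\n", msb_index(0x00010000u)); // prints 16
  std::printf("%u\n", msb_index(1u));          // prints 0
  return 0;
}

With the lowering moved into C++ as above, the xor produced for
CTLZ_ZERO_UNDEF is visible to dagcombine, so a function like msb_index
should compile to a lone bsr rather than bsr followed by a redundant xor;
this is essentially what the new t6 test in clz.ll checks.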