Teach constant folding to perform conversions from constant floating

point values to their integer representation through the SSE intrinsic calls. This is the last part of a README.txt entry for which I have real world examples. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123206 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-17 18:24:34 +00:00 · 2011-01-11 01:07:24 +00:00
parent f7b0047f5f
commit 15ed90c859
3 changed files with 89 additions and 55 deletions
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1047,6 +1047,14 @@ llvm::canConstantFoldCallTo(const Function *F) {
  case Intrinsic::smul_with_overflow:
  case Intrinsic::convert_from_fp16:
  case Intrinsic::convert_to_fp16:
  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
    return true;
  default:
    return false;
@@ -1116,6 +1124,36 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
  return 0; // dummy return to suppress warning
 }
 /// ConstantFoldConvertToInt - Fold an SSE floating point to integer conversion
 /// whose source is the constant floating point value held by Op.
 ///
 /// When roundTowardZero is true the value is truncated; otherwise it is
 /// rounded to nearest with ties going to even, which is what the
 /// non-truncating SSE conversion instructions do in the default rounding mode.
 /// Ty is the integer type of the result and determines how many bits the
 /// converted value may occupy (at most 64).
 ///
 /// Returns the folded integer Constant, or null when the conversion reports
 /// any status other than "ok" or "inexact".
 static Constant *ConstantFoldConvertToInt(ConstantFP *Op, bool roundTowardZero,
                                           const Type *Ty) {
  assert(Op && "Called with NULL operand");

  // Every intrinsic routed here yields an integer no wider than 64 bits, so a
  // single uint64_t is enough to receive the converted value.
  unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
  assert(NumBits <= 64 &&
         "Can only constant fold conversions to 64 and 32 bit ints");

  // Select the rounding behaviour requested by the caller.
  APFloat::roundingMode Rounding = APFloat::rmNearestTiesToEven;
  if (roundTowardZero)
    Rounding = APFloat::rmTowardZero;

  // Work on a private copy of the constant's floating point value.
  APFloat Source(Op->getValueAPF());
  uint64_t Bits;
  bool Exact = false; // Filled in by the conversion; not consulted afterwards.
  APFloat::opStatus Result =
      Source.convertToInteger(&Bits, NumBits, /*isSigned=*/true, Rounding,
                              &Exact);

  // Only a clean or merely inexact conversion produces a usable value.
  bool Usable = Result == APFloat::opOK || Result == APFloat::opInexact;
  if (!Usable)
    return 0;

  return ConstantInt::get(Ty, Bits, /*isSigned=*/true);
 }
 /// ConstantFoldCall - Attempt to constant fold a call to the specified function
 /// with the specified arguments, returning null if unsuccessful.
 Constant *
@@ -1246,6 +1284,24 @@ llvm::ConstantFoldCall(Function *F,
      }
    }
    if (ConstantVector *Op = dyn_cast<ConstantVector>(Operands[0])) {
      switch (F->getIntrinsicID()) {
      default: break;
      case Intrinsic::x86_sse_cvtss2si:
      case Intrinsic::x86_sse_cvtss2si64:
      case Intrinsic::x86_sse2_cvtsd2si:
      case Intrinsic::x86_sse2_cvtsd2si64:
        if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
          return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/false, Ty);
      case Intrinsic::x86_sse_cvttss2si:
      case Intrinsic::x86_sse_cvttss2si64:
      case Intrinsic::x86_sse2_cvttsd2si:
      case Intrinsic::x86_sse2_cvttsd2si64:
        if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
          return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/true, Ty);
      }
    }
    if (isa<UndefValue>(Operands[0])) {
      if (F->getIntrinsicID() == Intrinsic::bswap)
        return Operands[0];
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2259,58 +2259,3 @@ Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
 transform the fmul to 0.0, and then the fadd to 2.0.
 //===---------------------------------------------------------------------===//
 clang -O3 currently compiles this code:
 #include <emmintrin.h>
 int f(double x) { return _mm_cvtsd_si32(_mm_set_sd(x)); }
 int g(double x) { return _mm_cvttsd_si32(_mm_set_sd(x)); }
 into
 define i32 @_Z1fd(double %x) nounwind readnone {
 entry:
  %vecinit.i = insertelement <2 x double> undef, double %x, i32 0
  %vecinit1.i = insertelement <2 x double> %vecinit.i, double 0.000000e+00,i32 1
  %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %vecinit1.i) nounwind
  ret i32 %0
 }
 define i32 @_Z1gd(double %x) nounwind readnone {
 entry:
  %conv.i = fptosi double %x to i32
  ret i32 %conv.i
 }
 This difference carries over to the assembly produced, resulting in:
 _Z1fd:                                  # @_Z1fd
 # BB#0:                                 # %entry
        pushq   %rbp
        movq    %rsp, %rbp
        xorps   %xmm1, %xmm1
        movsd   %xmm0, %xmm1
        cvtsd2sil       %xmm1, %eax
        popq    %rbp
        ret
 _Z1gd:                                  # @_Z1gd
 # BB#0:                                 # %entry
        pushq   %rbp
        movq    %rsp, %rbp
        cvttsd2si       %xmm0, %eax
        popq    %rbp
        ret
 The problem is that we can't see through the intrinsic call used for cvtsd2si,
 and fold away the unnecessary manipulation of the function parameter. When
 these functions are inlined, it forms a barrier preventing many further
 optimizations. LLVM IR doesn't have a good way to model the logic of
 'cvtsd2si'; its only FP -> int conversion path forces truncation. We should add
 a rounding flag onto fptosi so that it can represent this type of rounding
 naturally in the IR rather than using intrinsics. We might need to use a
 'system_rounding_mode' flag to encode that the semantics of the rounding mode
 can be changed by the program, but ideally we could just say that isn't
 supported, and hard code the rounding.
 //===---------------------------------------------------------------------===//
--- a/test/Transforms/ConstProp/calls.ll
+++ b/test/Transforms/ConstProp/calls.ll
@@ -21,3 +21,36 @@ define double @T() {
  %c = fadd double %b, %D
  ret double %c
 }
 ; Exercises constant folding of the eight SSE/SSE2 scalar floating point to
 ; integer conversion intrinsics when their vector operand is a constant.
 ; The FileCheck directives inside the function require that no call survives
 ; and that the whole body collapses to returning true.
 define i1 @test_sse_cvt() nounwind readnone {
 ; CHECK: @test_sse_cvt
 ; CHECK-NOT: call
 ; CHECK: ret i1 true
 entry:
  ; Single-precision variants: only lane 0 (1.75) is defined, the rest is undef.
  ; NOTE(review): presumably the rounding forms fold to 2 and the truncating
  ; forms to 1 -- confirm against the folder's rounding modes.
  %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
  %i1 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
  %i2 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
  %i3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
  ; Double-precision variants, again with only lane 0 (1.75) defined.
  %i4 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind
  %i5 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind
  %i6 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind
  %i7 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind
  ; Add up the four i32 results.
  %sum11 = add i32 %i0, %i1
  %sum12 = add i32 %i4, %i5
  %sum1 = add i32 %sum11, %sum12
  ; Add up the four i64 results.
  %sum21 = add i64 %i2, %i3
  %sum22 = add i64 %i6, %i7
  %sum2 = add i64 %sum21, %sum22
  ; Widen the i32 total and compare it with the i64 total; the comparison is
  ; the function's return value.
  %sum1.sext = sext i32 %sum1 to i64
  %b = icmp eq i64 %sum1.sext, %sum2
  ret i1 %b
 }
 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone