mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-07-17 18:24:34 +00:00
Teach constant folding to perform conversions from constant floating
point values to their integer representation through the SSE intrinsic calls. This is the last part of a README.txt entry for which I have real world examples. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123206 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
@@ -1047,6 +1047,14 @@ llvm::canConstantFoldCallTo(const Function *F) {
|
|||||||
case Intrinsic::smul_with_overflow:
|
case Intrinsic::smul_with_overflow:
|
||||||
case Intrinsic::convert_from_fp16:
|
case Intrinsic::convert_from_fp16:
|
||||||
case Intrinsic::convert_to_fp16:
|
case Intrinsic::convert_to_fp16:
|
||||||
|
case Intrinsic::x86_sse_cvtss2si:
|
||||||
|
case Intrinsic::x86_sse_cvtss2si64:
|
||||||
|
case Intrinsic::x86_sse_cvttss2si:
|
||||||
|
case Intrinsic::x86_sse_cvttss2si64:
|
||||||
|
case Intrinsic::x86_sse2_cvtsd2si:
|
||||||
|
case Intrinsic::x86_sse2_cvtsd2si64:
|
||||||
|
case Intrinsic::x86_sse2_cvttsd2si:
|
||||||
|
case Intrinsic::x86_sse2_cvttsd2si64:
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
@@ -1116,6 +1124,36 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
|
|||||||
return 0; // dummy return to suppress warning
|
return 0; // dummy return to suppress warning
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// ConstantFoldConvertToInt - Attempt to fold an SSE floating point to integer
|
||||||
|
/// conversion of a constant floating point. If roundTowardZero is false, the
|
||||||
|
/// default IEEE rounding is used (toward nearest, ties to even). This matches
|
||||||
|
/// the behavior of the non-truncating SSE instructions in the default rounding
|
||||||
|
/// mode. The desired integer type Ty is used to select how many bits are
|
||||||
|
/// available for the result (at most 64). Returns null if the conversion
|
||||||
|
/// cannot be performed (any status other than opOK or opInexact), otherwise
|
||||||
|
/// returns the signed integer Constant of type Ty resulting from the conversion.
|
||||||
|
static Constant *ConstantFoldConvertToInt(ConstantFP *Op, bool roundTowardZero,
|
||||||
|
const Type *Ty) {
|
||||||
|
assert(Op && "Called with NULL operand");
|
||||||
|
APFloat Val(Op->getValueAPF());
|
||||||
|
|
||||||
|
// All of these conversion intrinsics form an integer of at most 64 bits.
|
||||||
|
unsigned ResultWidth = cast<IntegerType>(Ty)->getBitWidth();
|
||||||
|
assert(ResultWidth <= 64 &&
|
||||||
|
"Can only constant fold conversions to 64 and 32 bit ints");
|
||||||
|
|
||||||
|
// UIntVal receives the converted bits. isExact is an out-parameter of
// convertToInteger; its value is not read afterwards in this function.
uint64_t UIntVal;
|
||||||
|
bool isExact = false;
|
||||||
|
APFloat::roundingMode mode = roundTowardZero? APFloat::rmTowardZero
|
||||||
|
: APFloat::rmNearestTiesToEven;
|
||||||
|
APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
|
||||||
|
/*isSigned=*/true, mode,
|
||||||
|
&isExact);
|
||||||
|
// An inexact (rounded) result is still folded; any other non-OK status
// leaves the call unfolded by returning null.
if (status != APFloat::opOK && status != APFloat::opInexact)
|
||||||
|
return 0;
|
||||||
|
return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
|
||||||
|
}
|
||||||
|
|
||||||
/// ConstantFoldCall - Attempt to constant fold a call to the specified function
|
/// ConstantFoldCall - Attempt to constant fold a call to the specified function
|
||||||
/// with the specified arguments, returning null if unsuccessful.
|
/// with the specified arguments, returning null if unsuccessful.
|
||||||
Constant *
|
Constant *
|
||||||
@@ -1246,6 +1284,24 @@ llvm::ConstantFoldCall(Function *F,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ConstantVector *Op = dyn_cast<ConstantVector>(Operands[0])) {
|
||||||
|
switch (F->getIntrinsicID()) {
|
||||||
|
default: break;
|
||||||
|
case Intrinsic::x86_sse_cvtss2si:
|
||||||
|
case Intrinsic::x86_sse_cvtss2si64:
|
||||||
|
case Intrinsic::x86_sse2_cvtsd2si:
|
||||||
|
case Intrinsic::x86_sse2_cvtsd2si64:
|
||||||
|
if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
|
||||||
|
return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/false, Ty);
|
||||||
|
case Intrinsic::x86_sse_cvttss2si:
|
||||||
|
case Intrinsic::x86_sse_cvttss2si64:
|
||||||
|
case Intrinsic::x86_sse2_cvttsd2si:
|
||||||
|
case Intrinsic::x86_sse2_cvttsd2si64:
|
||||||
|
if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
|
||||||
|
return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/true, Ty);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (isa<UndefValue>(Operands[0])) {
|
if (isa<UndefValue>(Operands[0])) {
|
||||||
if (F->getIntrinsicID() == Intrinsic::bswap)
|
if (F->getIntrinsicID() == Intrinsic::bswap)
|
||||||
return Operands[0];
|
return Operands[0];
|
||||||
|
@@ -2259,58 +2259,3 @@ Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
|
|||||||
transform the fmul to 0.0, and then the fadd to 2.0.
|
transform the fmul to 0.0, and then the fadd to 2.0.
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
clang -O3 currently compiles this code:
|
|
||||||
|
|
||||||
#include <emmintrin.h>
|
|
||||||
int f(double x) { return _mm_cvtsd_si32(_mm_set_sd(x)); }
|
|
||||||
int g(double x) { return _mm_cvttsd_si32(_mm_set_sd(x)); }
|
|
||||||
|
|
||||||
into
|
|
||||||
|
|
||||||
define i32 @_Z1fd(double %x) nounwind readnone {
|
|
||||||
entry:
|
|
||||||
%vecinit.i = insertelement <2 x double> undef, double %x, i32 0
|
|
||||||
%vecinit1.i = insertelement <2 x double> %vecinit.i, double 0.000000e+00,i32 1
|
|
||||||
%0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %vecinit1.i) nounwind
|
|
||||||
ret i32 %0
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @_Z1gd(double %x) nounwind readnone {
|
|
||||||
entry:
|
|
||||||
%conv.i = fptosi double %x to i32
|
|
||||||
ret i32 %conv.i
|
|
||||||
}
|
|
||||||
|
|
||||||
This difference carries over to the assembly produced, resulting in:
|
|
||||||
|
|
||||||
_Z1fd: # @_Z1fd
|
|
||||||
# BB#0: # %entry
|
|
||||||
pushq %rbp
|
|
||||||
movq %rsp, %rbp
|
|
||||||
xorps %xmm1, %xmm1
|
|
||||||
movsd %xmm0, %xmm1
|
|
||||||
cvtsd2sil %xmm1, %eax
|
|
||||||
popq %rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
_Z1gd: # @_Z1gd
|
|
||||||
# BB#0: # %entry
|
|
||||||
pushq %rbp
|
|
||||||
movq %rsp, %rbp
|
|
||||||
cvttsd2si %xmm0, %eax
|
|
||||||
popq %rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
The problem is that we can't see through the intrinsic call used for cvtsd2si,
|
|
||||||
and fold away the unnecessary manipulation of the function parameter. When
|
|
||||||
these functions are inlined, it forms a barrier preventing many further
|
|
||||||
optimizations. LLVM IR doesn't have a good way to model the logic of
|
|
||||||
'cvtsd2si'; its only FP -> int conversion path forces truncation. We should add
|
|
||||||
a rounding flag onto fptosi so that it can represent this type of rounding
|
|
||||||
naturally in the IR rather than using intrinsics. We might need to use a
|
|
||||||
'system_rounding_mode' flag to encode that the semantics of the rounding mode
|
|
||||||
can be changed by the program, but ideally we could just say that isn't
|
|
||||||
supported, and hard code the rounding.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
@@ -21,3 +21,36 @@ define double @T() {
|
|||||||
%c = fadd double %b, %D
|
%c = fadd double %b, %D
|
||||||
ret double %c
|
ret double %c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; Exercises constant folding of the eight SSE/SSE2 scalar float-to-int
; conversion intrinsics (rounding and truncating, 32- and 64-bit results) on
; the constant 1.75 in lane 0. The function is expected to contain no
; remaining intrinsic invocations and to return the constant true.
define i1 @test_sse_cvt() nounwind readnone {
|
||||||
|
; CHECK: @test_sse_cvt
|
||||||
|
; CHECK-NOT: call
|
||||||
|
; CHECK: ret i1 true
|
||||||
|
entry:
|
||||||
|
%i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
|
||||||
|
%i1 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
|
||||||
|
%i2 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
|
||||||
|
%i3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
|
||||||
|
%i4 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind
|
||||||
|
%i5 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind
|
||||||
|
%i6 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind
|
||||||
|
%i7 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind
|
||||||
|
; The i32 results and the i64 results are summed separately; the i32 sum is
; sign-extended and compared with the i64 sum, so the comparison can only
; fold to a constant once every conversion above has been folded.
%sum11 = add i32 %i0, %i1
|
||||||
|
%sum12 = add i32 %i4, %i5
|
||||||
|
%sum1 = add i32 %sum11, %sum12
|
||||||
|
%sum21 = add i64 %i2, %i3
|
||||||
|
%sum22 = add i64 %i6, %i7
|
||||||
|
%sum2 = add i64 %sum21, %sum22
|
||||||
|
%sum1.sext = sext i32 %sum1 to i64
|
||||||
|
%b = icmp eq i64 %sum1.sext, %sum2
|
||||||
|
ret i1 %b
|
||||||
|
}
|
||||||
|
|
||||||
|
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
|
||||||
|
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
|
||||||
|
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
|
||||||
|
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
|
||||||
|
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
|
||||||
|
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
|
||||||
|
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
|
||||||
|
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
|
||||||
|
Reference in New Issue
Block a user