mirror of
				https://github.com/c64scene-ar/llvm-6502.git
				synced 2025-11-04 05:17:07 +00:00 
			
		
		
		
	Set the transform bar at 2 divisions because the fastest current x86 FP divider circuit is in SandyBridge / Haswell at 10 cycle latency (best case) relative to a 5 cycle multiplier. So that's the worst case for this transform (no latency win), but multiplies are obviously pipelined while divisions are not, so there's still a big throughput win which we would expect to show up in typical FP code. These are the sequences I'm comparing: divss %xmm2, %xmm0 mulss %xmm1, %xmm0 divss %xmm2, %xmm0 Becomes: movss LCPI0_0(%rip), %xmm3 ## xmm3 = mem[0],zero,zero,zero divss %xmm2, %xmm3 mulss %xmm3, %xmm0 mulss %xmm1, %xmm0 mulss %xmm3, %xmm0 [Ignore for the moment that we don't optimize the chain of 3 multiplies into 2 independent fmuls followed by 1 dependent fmul...this is the DAG version of: https://llvm.org/bugs/show_bug.cgi?id=21768 ...if we fix that, then the transform becomes even more profitable on all targets.] Differential Revision: http://reviews.llvm.org/D8941 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@235012 91177308-0d34-0410-b5e6-96231b3b80d8
		
			
				
	
	
		
			32 lines
		
	
	
		
			993 B
		
	
	
	
		
			LLVM
		
	
	
	
	
	
			
		
		
	
	
			32 lines
		
	
	
		
			993 B
		
	
	
	
		
			LLVM
		
	
	
	
	
	
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 | 
						|
 | 
						|
; Two or more divisions that share a single divisor operand
 | 
						|
; should be converted into a reciprocal and multiplication.
 | 
						|
 | 
						|
; Negative case: the body contains a single 'fdiv arcp' (%x / %y), so there is
; no repeated divisor. The expected output keeps one divss and introduces no
; reciprocal or multiply. The %z parameter is unused.
define float @div1_arcp(float %x, float %y, float %z) #0 {
 | 
						|
; CHECK-LABEL: div1_arcp:
 | 
						|
; CHECK:       # BB#0:
 | 
						|
; CHECK-NEXT:    divss %xmm1, %xmm0
 | 
						|
; CHECK-NEXT:    retq
 | 
						|
  %div1 = fdiv arcp float %x, %y
 | 
						|
  ret float %div1
 | 
						|
}
 | 
						|
 | 
						|
; Positive case: two 'fdiv arcp' instructions divide by the same operand %z,
; with an 'fmul arcp' by %y between them. The expected output contains exactly
; one divss, which divides a value loaded from memory (presumably the constant
; 1.0 - confirm against the constant pool) by %xmm2 and leaves the result in
; %xmm3. Both original divisions then appear as mulss by %xmm3, with the
; multiply by %xmm1 in between.
define float @div2_arcp(float %x, float %y, float %z) #0 {
 | 
						|
; CHECK-LABEL: div2_arcp:
 | 
						|
; CHECK:       # BB#0:
 | 
						|
; CHECK-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 | 
						|
; CHECK-NEXT:    divss %xmm2, %xmm3
 | 
						|
; CHECK-NEXT:    mulss %xmm3, %xmm0
 | 
						|
; CHECK-NEXT:    mulss %xmm1, %xmm0
 | 
						|
; CHECK-NEXT:    mulss %xmm3, %xmm0
 | 
						|
; CHECK-NEXT:    retq
 | 
						|
  %div1 = fdiv arcp float %x, %z
 | 
						|
  %mul = fmul arcp float %div1, %y
 | 
						|
  %div2 = fdiv arcp float %mul, %z
 | 
						|
  ret float %div2
 | 
						|
}
 | 
						|
 | 
						|
; Attribute group #0 is referenced by both @div1_arcp and @div2_arcp and sets
; the "unsafe-fp-math" string attribute to "true" on them.
; FIXME: If the backend understands 'arcp', then this attribute is unnecessary.
 | 
						|
attributes #0 = { "unsafe-fp-math"="true" }
 |