mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-07-04 02:24:29 +00:00
Split the SSE readme items out into their own README.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@28400 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
662
lib/Target/X86/README-SSE.txt
Normal file
662
lib/Target/X86/README-SSE.txt
Normal file
@ -0,0 +1,662 @@
|
|||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
// Random ideas for the X86 backend: SSE-specific stuff.
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
|
||||||
|
other fast SSE modes.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Think about doing i64 math in SSE regs.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
This testcase should have no SSE instructions in it, and only one load from
|
||||||
|
a constant pool:
|
||||||
|
|
||||||
|
double %test3(bool %B) {
|
||||||
|
%C = select bool %B, double 123.412, double 523.01123123
|
||||||
|
ret double %C
|
||||||
|
}
|
||||||
|
|
||||||
|
Currently, the select is being lowered, which prevents the dag combiner from
|
||||||
|
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
|
||||||
|
|
||||||
|
The pattern isel got this one right.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
|
||||||
|
like this:
|
||||||
|
|
||||||
|
X += y
|
||||||
|
|
||||||
|
and the register allocator decides to spill X, it is cheaper to emit this as:
|
||||||
|
|
||||||
|
Y += [xslot]
|
||||||
|
store Y -> [xslot]
|
||||||
|
|
||||||
|
than as:
|
||||||
|
|
||||||
|
tmp = [xslot]
|
||||||
|
tmp += y
|
||||||
|
store tmp -> [xslot]
|
||||||
|
|
||||||
|
..and this uses one fewer register (so this should be done at load folding
|
||||||
|
time, not at spiller time). *Note* however that this can only be done
|
||||||
|
if Y is dead. Here's a testcase:
|
||||||
|
|
||||||
|
%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0]
|
||||||
|
implementation ; Functions:
|
||||||
|
declare void %printf(int, ...)
|
||||||
|
void %main() {
|
||||||
|
build_tree.exit:
|
||||||
|
br label %no_exit.i7
|
||||||
|
no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
|
||||||
|
%tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; <double> [#uses=1]
|
||||||
|
%tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; <double> [#uses=1]
|
||||||
|
%tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
|
||||||
|
%tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
|
||||||
|
br bool false, label %Compute_Tree.exit23, label %no_exit.i7
|
||||||
|
Compute_Tree.exit23: ; preds = %no_exit.i7
|
||||||
|
tail call void (int, ...)* %printf( int 0 )
|
||||||
|
store double %tmp.34.i18, double* null
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
We currently emit:
|
||||||
|
|
||||||
|
.BBmain_1:
|
||||||
|
xorpd %XMM1, %XMM1
|
||||||
|
addsd %XMM0, %XMM1
|
||||||
|
*** movsd %XMM2, QWORD PTR [%ESP + 8]
|
||||||
|
*** addsd %XMM2, %XMM1
|
||||||
|
*** movsd QWORD PTR [%ESP + 8], %XMM2
|
||||||
|
jmp .BBmain_1 # no_exit.i7
|
||||||
|
|
||||||
|
This is a bugpoint reduced testcase, which is why the testcase doesn't make
|
||||||
|
much sense (e.g. it's an infinite loop). :)
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
SSE should implement 'select_cc' using 'emulated conditional moves' that use
|
||||||
|
pcmp/pand/pandn/por to do a selection instead of a conditional branch:
|
||||||
|
|
||||||
|
double %X(double %Y, double %Z, double %A, double %B) {
|
||||||
|
%C = setlt double %A, %B
|
||||||
|
%z = add double %Z, 0.0 ;; select operand is not a load
|
||||||
|
%D = select bool %C, double %Y, double %z
|
||||||
|
ret double %D
|
||||||
|
}
|
||||||
|
|
||||||
|
We currently emit:
|
||||||
|
|
||||||
|
_X:
|
||||||
|
subl $12, %esp
|
||||||
|
xorpd %xmm0, %xmm0
|
||||||
|
addsd 24(%esp), %xmm0
|
||||||
|
movsd 32(%esp), %xmm1
|
||||||
|
movsd 16(%esp), %xmm2
|
||||||
|
ucomisd 40(%esp), %xmm1
|
||||||
|
jb LBB_X_2
|
||||||
|
LBB_X_1:
|
||||||
|
movsd %xmm0, %xmm2
|
||||||
|
LBB_X_2:
|
||||||
|
movsd %xmm2, (%esp)
|
||||||
|
fldl (%esp)
|
||||||
|
addl $12, %esp
|
||||||
|
ret
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
It's not clear whether we should use pxor or xorps / xorpd to clear XMM
|
||||||
|
registers. The choice may depend on subtarget information. We should do some
|
||||||
|
more experiments on different x86 machines.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Currently the x86 codegen isn't very good at mixing SSE and FPStack
|
||||||
|
code:
|
||||||
|
|
||||||
|
unsigned int foo(double x) { return x; }
|
||||||
|
|
||||||
|
foo:
|
||||||
|
subl $20, %esp
|
||||||
|
movsd 24(%esp), %xmm0
|
||||||
|
movsd %xmm0, 8(%esp)
|
||||||
|
fldl 8(%esp)
|
||||||
|
fisttpll (%esp)
|
||||||
|
movl (%esp), %eax
|
||||||
|
addl $20, %esp
|
||||||
|
ret
|
||||||
|
|
||||||
|
This will be solved when we go to a dynamic programming based isel.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Should generate min/max for stuff like:
|
||||||
|
|
||||||
|
void minf(float a, float b, float *X) {
|
||||||
|
*X = a <= b ? a : b;
|
||||||
|
}
|
||||||
|
|
||||||
|
Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
|
||||||
|
and ISD::FMAX node types?
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
The first BB of this code:
|
||||||
|
|
||||||
|
declare bool %foo()
|
||||||
|
int %bar() {
|
||||||
|
%V = call bool %foo()
|
||||||
|
br bool %V, label %T, label %F
|
||||||
|
T:
|
||||||
|
ret int 1
|
||||||
|
F:
|
||||||
|
call bool %foo()
|
||||||
|
ret int 12
|
||||||
|
}
|
||||||
|
|
||||||
|
compiles to:
|
||||||
|
|
||||||
|
_bar:
|
||||||
|
subl $12, %esp
|
||||||
|
call L_foo$stub
|
||||||
|
xorb $1, %al
|
||||||
|
testb %al, %al
|
||||||
|
jne LBB_bar_2 # F
|
||||||
|
|
||||||
|
It would be better to emit "cmp %al, 1" than a xor and test.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
|
||||||
|
feasible.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
|
||||||
|
the reg-reg copy in this example:
|
||||||
|
|
||||||
|
float foo(int *x, float *y, unsigned c) {
|
||||||
|
float res = 0.0;
|
||||||
|
unsigned i;
|
||||||
|
for (i = 0; i < c; i++) {
|
||||||
|
float xx = (float)x[i];
|
||||||
|
xx = xx * y[i];
|
||||||
|
xx += res;
|
||||||
|
res = xx;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
LBB_foo_3: # no_exit
|
||||||
|
cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
|
||||||
|
mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
|
||||||
|
addss %XMM0, %XMM1
|
||||||
|
inc %ESI
|
||||||
|
cmp %ESI, %ECX
|
||||||
|
**** movaps %XMM1, %XMM0
|
||||||
|
jb LBB_foo_3 # no_exit
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Codegen:
|
||||||
|
if (copysign(1.0, x) == copysign(1.0, y))
|
||||||
|
into:
|
||||||
|
if (x^y & mask)
|
||||||
|
when using SSE.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
|
||||||
|
of a v4sf value.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
|
||||||
|
Perhaps use pxor / xorp* to clear an XMM register first?
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Better codegen for:
|
||||||
|
|
||||||
|
void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
|
||||||
|
void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
|
||||||
|
|
||||||
|
For the latter we generate:
|
||||||
|
|
||||||
|
_f:
|
||||||
|
pxor %xmm0, %xmm0
|
||||||
|
movss 8(%esp), %xmm1
|
||||||
|
movaps %xmm0, %xmm2
|
||||||
|
unpcklps %xmm1, %xmm2
|
||||||
|
movss 4(%esp), %xmm1
|
||||||
|
unpcklps %xmm0, %xmm1
|
||||||
|
unpcklps %xmm2, %xmm1
|
||||||
|
movl 12(%esp), %eax
|
||||||
|
movaps %xmm1, (%eax)
|
||||||
|
ret
|
||||||
|
|
||||||
|
This seems like it should use shufps, one for each of a & b.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
How to decide when to use the "floating point version" of logical ops? Here are
|
||||||
|
some code fragments:
|
||||||
|
|
||||||
|
movaps LCPI5_5, %xmm2
|
||||||
|
divps %xmm1, %xmm2
|
||||||
|
mulps %xmm2, %xmm3
|
||||||
|
mulps 8656(%ecx), %xmm3
|
||||||
|
addps 8672(%ecx), %xmm3
|
||||||
|
andps LCPI5_6, %xmm2
|
||||||
|
andps LCPI5_1, %xmm3
|
||||||
|
por %xmm2, %xmm3
|
||||||
|
movdqa %xmm3, (%edi)
|
||||||
|
|
||||||
|
movaps LCPI5_5, %xmm1
|
||||||
|
divps %xmm0, %xmm1
|
||||||
|
mulps %xmm1, %xmm3
|
||||||
|
mulps 8656(%ecx), %xmm3
|
||||||
|
addps 8672(%ecx), %xmm3
|
||||||
|
andps LCPI5_6, %xmm1
|
||||||
|
andps LCPI5_1, %xmm3
|
||||||
|
orps %xmm1, %xmm3
|
||||||
|
movaps %xmm3, 112(%esp)
|
||||||
|
movaps %xmm3, (%ebx)
|
||||||
|
|
||||||
|
Due to some minor source change, the latter case ended up using orps and movaps
|
||||||
|
instead of por and movdqa. Does it matter?
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Use movddup to splat a v2f64 directly from a memory source. e.g.
|
||||||
|
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
void test(__m128d *r, double A) {
|
||||||
|
*r = _mm_set1_pd(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
llc:
|
||||||
|
|
||||||
|
_test:
|
||||||
|
movsd 8(%esp), %xmm0
|
||||||
|
unpcklpd %xmm0, %xmm0
|
||||||
|
movl 4(%esp), %eax
|
||||||
|
movapd %xmm0, (%eax)
|
||||||
|
ret
|
||||||
|
|
||||||
|
icc:
|
||||||
|
|
||||||
|
_test:
|
||||||
|
movl 4(%esp), %eax
|
||||||
|
movddup 8(%esp), %xmm0
|
||||||
|
movapd %xmm0, (%eax)
|
||||||
|
ret
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
|
||||||
|
to choose between movaps, movapd, and movdqa based on types of source and
|
||||||
|
destination?
|
||||||
|
|
||||||
|
How about andps, andpd, and pand? Do we really care about the type of the packed
|
||||||
|
elements? If not, why not always use the "ps" variants which are likely to be
|
||||||
|
shorter.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
We are emitting bad code for this:
|
||||||
|
|
||||||
|
float %test(float* %V, int %I, int %D, float %V) {
|
||||||
|
entry:
|
||||||
|
%tmp = seteq int %D, 0
|
||||||
|
br bool %tmp, label %cond_true, label %cond_false23
|
||||||
|
|
||||||
|
cond_true:
|
||||||
|
%tmp3 = getelementptr float* %V, int %I
|
||||||
|
%tmp = load float* %tmp3
|
||||||
|
%tmp5 = setgt float %tmp, %V
|
||||||
|
%tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
|
||||||
|
%tmp7 = or bool %tmp5, %tmp6
|
||||||
|
br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
|
||||||
|
|
||||||
|
cond_next:
|
||||||
|
%tmp10 = add int %I, 1
|
||||||
|
%tmp12 = getelementptr float* %V, int %tmp10
|
||||||
|
%tmp13 = load float* %tmp12
|
||||||
|
%tmp15 = setle float %tmp13, %V
|
||||||
|
%tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
|
||||||
|
%tmp17 = or bool %tmp15, %tmp16
|
||||||
|
%retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
|
||||||
|
ret float %retval
|
||||||
|
|
||||||
|
cond_false23:
|
||||||
|
%tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
|
||||||
|
ret float %tmp28
|
||||||
|
|
||||||
|
UnifiedReturnBlock: ; preds = %cond_true
|
||||||
|
ret float 0.000000e+00
|
||||||
|
}
|
||||||
|
|
||||||
|
declare bool %llvm.isunordered.f32(float, float)
|
||||||
|
|
||||||
|
declare float %foo(float*, int, int, float)
|
||||||
|
|
||||||
|
|
||||||
|
It exposes a known load folding problem:
|
||||||
|
|
||||||
|
movss (%edx,%ecx,4), %xmm1
|
||||||
|
ucomiss %xmm1, %xmm0
|
||||||
|
|
||||||
|
As well as this:
|
||||||
|
|
||||||
|
LBB_test_2: # cond_next
|
||||||
|
movss LCPI1_0, %xmm2
|
||||||
|
pxor %xmm3, %xmm3
|
||||||
|
ucomiss %xmm0, %xmm1
|
||||||
|
jbe LBB_test_6 # cond_next
|
||||||
|
LBB_test_5: # cond_next
|
||||||
|
movaps %xmm2, %xmm3
|
||||||
|
LBB_test_6: # cond_next
|
||||||
|
movss %xmm3, 40(%esp)
|
||||||
|
flds 40(%esp)
|
||||||
|
addl $44, %esp
|
||||||
|
ret
|
||||||
|
|
||||||
|
Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
|
||||||
|
three moves (movss, movaps, movss).
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
External test Nurbs exposed some problems. Look for
|
||||||
|
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
|
||||||
|
emits:
|
||||||
|
|
||||||
|
movaps (%edx), %xmm2 #59.21
|
||||||
|
movaps (%edx), %xmm5 #60.21
|
||||||
|
movaps (%edx), %xmm4 #61.21
|
||||||
|
movaps (%edx), %xmm3 #62.21
|
||||||
|
movl 40(%ecx), %ebp #69.49
|
||||||
|
shufps $0, %xmm2, %xmm5 #60.21
|
||||||
|
movl 100(%esp), %ebx #69.20
|
||||||
|
movl (%ebx), %edi #69.20
|
||||||
|
imull %ebp, %edi #69.49
|
||||||
|
addl (%eax), %edi #70.33
|
||||||
|
shufps $85, %xmm2, %xmm4 #61.21
|
||||||
|
shufps $170, %xmm2, %xmm3 #62.21
|
||||||
|
shufps $255, %xmm2, %xmm2 #63.21
|
||||||
|
lea (%ebp,%ebp,2), %ebx #69.49
|
||||||
|
negl %ebx #69.49
|
||||||
|
lea -3(%edi,%ebx), %ebx #70.33
|
||||||
|
shll $4, %ebx #68.37
|
||||||
|
addl 32(%ecx), %ebx #68.37
|
||||||
|
testb $15, %bl #91.13
|
||||||
|
jne L_B1.24 # Prob 5% #91.13
|
||||||
|
|
||||||
|
This is the llvm code after instruction scheduling:
|
||||||
|
|
||||||
|
cond_next140 (0xa910740, LLVM BB @0xa90beb0):
|
||||||
|
%reg1078 = MOV32ri -3
|
||||||
|
%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
|
||||||
|
%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
|
||||||
|
%reg1080 = IMUL32rr %reg1079, %reg1037
|
||||||
|
%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
|
||||||
|
%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
|
||||||
|
%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
|
||||||
|
%reg1082 = SHL32ri %reg1038, 4
|
||||||
|
%reg1039 = ADD32rr %reg1036, %reg1082
|
||||||
|
%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
|
||||||
|
%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
|
||||||
|
%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
|
||||||
|
%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
|
||||||
|
%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
|
||||||
|
%reg1040 = MOV32rr %reg1039
|
||||||
|
%reg1084 = AND32ri8 %reg1039, 15
|
||||||
|
CMP32ri8 %reg1084, 0
|
||||||
|
JE mbb<cond_next204,0xa914d30>
|
||||||
|
|
||||||
|
Still ok. After register allocation:
|
||||||
|
|
||||||
|
cond_next140 (0xa910740, LLVM BB @0xa90beb0):
|
||||||
|
%EAX = MOV32ri -3
|
||||||
|
%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
|
||||||
|
ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
|
||||||
|
%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
|
||||||
|
%EDX = MOV32rm %EDX, 1, %NOREG, 40
|
||||||
|
IMUL32rr %EAX<def&use>, %EDX
|
||||||
|
%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
|
||||||
|
%ESI = MOV32rm %ESI, 1, %NOREG, 0
|
||||||
|
MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
|
||||||
|
%EAX = LEA32r %ESI, 1, %EAX, -3
|
||||||
|
%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
|
||||||
|
%ESI = MOV32rm %ESI, 1, %NOREG, 32
|
||||||
|
%EDI = MOV32rr %EAX
|
||||||
|
SHL32ri %EDI<def&use>, 4
|
||||||
|
ADD32rr %EDI<def&use>, %ESI
|
||||||
|
%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
|
||||||
|
%XMM1 = MOVAPSrr %XMM0
|
||||||
|
SHUFPSrr %XMM1<def&use>, %XMM1, 170
|
||||||
|
%XMM2 = MOVAPSrr %XMM0
|
||||||
|
SHUFPSrr %XMM2<def&use>, %XMM2, 0
|
||||||
|
%XMM3 = MOVAPSrr %XMM0
|
||||||
|
SHUFPSrr %XMM3<def&use>, %XMM3, 255
|
||||||
|
SHUFPSrr %XMM0<def&use>, %XMM0, 85
|
||||||
|
%EBX = MOV32rr %EDI
|
||||||
|
AND32ri8 %EBX<def&use>, 15
|
||||||
|
CMP32ri8 %EBX, 0
|
||||||
|
JE mbb<cond_next204,0xa914d30>
|
||||||
|
|
||||||
|
This looks really bad. The problem is that shufps is a destructive opcode. Since %reg1083
|
||||||
|
appears as operand two in more than one shufps op, it resulted in a number of
|
||||||
|
copies. Note icc also suffers from the same problem. Either the instruction
|
||||||
|
selector should select pshufd or the register allocator can make the two-address
|
||||||
|
to three-address transformation.
|
||||||
|
|
||||||
|
It also exposes some other problems. See MOV32ri -3 and the spills.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
|
||||||
|
|
||||||
|
LLVM is producing bad code.
|
||||||
|
|
||||||
|
LBB_main_4: # cond_true44
|
||||||
|
addps %xmm1, %xmm2
|
||||||
|
subps %xmm3, %xmm2
|
||||||
|
movaps (%ecx), %xmm4
|
||||||
|
movaps %xmm2, %xmm1
|
||||||
|
addps %xmm4, %xmm1
|
||||||
|
addl $16, %ecx
|
||||||
|
incl %edx
|
||||||
|
cmpl $262144, %edx
|
||||||
|
movaps %xmm3, %xmm2
|
||||||
|
movaps %xmm4, %xmm3
|
||||||
|
jne LBB_main_4 # cond_true44
|
||||||
|
|
||||||
|
There are two problems. 1) No need for two loop induction variables. We can
|
||||||
|
compare against 262144 * 16. 2) Known register coalescer issue. We should
|
||||||
|
be able to eliminate one of the movaps:
|
||||||
|
|
||||||
|
addps %xmm2, %xmm1 <=== Commute!
|
||||||
|
subps %xmm3, %xmm1
|
||||||
|
movaps (%ecx), %xmm4
|
||||||
|
movaps %xmm1, %xmm1 <=== Eliminate!
|
||||||
|
addps %xmm4, %xmm1
|
||||||
|
addl $16, %ecx
|
||||||
|
incl %edx
|
||||||
|
cmpl $262144, %edx
|
||||||
|
movaps %xmm3, %xmm2
|
||||||
|
movaps %xmm4, %xmm3
|
||||||
|
jne LBB_main_4 # cond_true44
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Consider:
|
||||||
|
|
||||||
|
__m128 test(float a) {
|
||||||
|
return _mm_set_ps(0.0, 0.0, 0.0, a*a);
|
||||||
|
}
|
||||||
|
|
||||||
|
This compiles into:
|
||||||
|
|
||||||
|
movss 4(%esp), %xmm1
|
||||||
|
mulss %xmm1, %xmm1
|
||||||
|
xorps %xmm0, %xmm0
|
||||||
|
movss %xmm1, %xmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
Because mulss doesn't modify the top 3 elements, the top elements of
|
||||||
|
xmm1 are already zero'd. We could compile this to:
|
||||||
|
|
||||||
|
movss 4(%esp), %xmm0
|
||||||
|
mulss %xmm0, %xmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Here's a sick and twisted idea. Consider code like this:
|
||||||
|
|
||||||
|
__m128 test(__m128 a) {
|
||||||
|
float b = *(float*)&a;
|
||||||
|
...
|
||||||
|
return _mm_set_ps(0.0, 0.0, 0.0, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
This might compile to this code:
|
||||||
|
|
||||||
|
movaps c(%esp), %xmm1
|
||||||
|
xorps %xmm0, %xmm0
|
||||||
|
movss %xmm1, %xmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
Now consider if the ... code caused xmm1 to get spilled. This might produce
|
||||||
|
this code:
|
||||||
|
|
||||||
|
movaps c(%esp), %xmm1
|
||||||
|
movaps %xmm1, c2(%esp)
|
||||||
|
...
|
||||||
|
|
||||||
|
xorps %xmm0, %xmm0
|
||||||
|
movaps c2(%esp), %xmm1
|
||||||
|
movss %xmm1, %xmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
However, since the reload is only used by these instructions, we could
|
||||||
|
"fold" it into the uses, producing something like this:
|
||||||
|
|
||||||
|
movaps c(%esp), %xmm1
|
||||||
|
movaps %xmm1, c2(%esp)
|
||||||
|
...
|
||||||
|
|
||||||
|
movss c2(%esp), %xmm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
... saving two instructions.
|
||||||
|
|
||||||
|
The basic idea is that a reload from a spill slot can, if only one 4-byte
|
||||||
|
chunk is used, bring in 3 zeros and the one element instead of 4 elements.
|
||||||
|
This can be used to simplify a variety of shuffle operations, where the
|
||||||
|
elements are fixed zeros.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
For this:
|
||||||
|
|
||||||
|
#include <emmintrin.h>
|
||||||
|
void test(__m128d *r, __m128d *A, double B) {
|
||||||
|
*r = _mm_loadl_pd(*A, &B);
|
||||||
|
}
|
||||||
|
|
||||||
|
We generate:
|
||||||
|
|
||||||
|
subl $12, %esp
|
||||||
|
movsd 24(%esp), %xmm0
|
||||||
|
movsd %xmm0, (%esp)
|
||||||
|
movl 20(%esp), %eax
|
||||||
|
movapd (%eax), %xmm0
|
||||||
|
movlpd (%esp), %xmm0
|
||||||
|
movl 16(%esp), %eax
|
||||||
|
movapd %xmm0, (%eax)
|
||||||
|
addl $12, %esp
|
||||||
|
ret
|
||||||
|
|
||||||
|
icc generates:
|
||||||
|
|
||||||
|
movl 4(%esp), %edx #3.6
|
||||||
|
movl 8(%esp), %eax #3.6
|
||||||
|
movapd (%eax), %xmm0 #4.22
|
||||||
|
movlpd 12(%esp), %xmm0 #4.8
|
||||||
|
movapd %xmm0, (%edx) #4.3
|
||||||
|
ret #5.1
|
||||||
|
|
||||||
|
So icc is smart enough to know that B is in memory so it doesn't load it and
|
||||||
|
store it back to stack.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
__m128d test1( __m128d A, __m128d B) {
|
||||||
|
return _mm_shuffle_pd(A, B, 0x3);
|
||||||
|
}
|
||||||
|
|
||||||
|
compiles to
|
||||||
|
|
||||||
|
shufpd $3, %xmm1, %xmm0
|
||||||
|
|
||||||
|
Perhaps it's better to use unpckhpd instead?
|
||||||
|
|
||||||
|
unpckhpd %xmm1, %xmm0
|
||||||
|
|
||||||
|
Don't know if unpckhpd is faster. But it is shorter.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
This code generates ugly code, probably due to costs being off or something:
|
||||||
|
|
||||||
|
void %test(float* %P, <4 x float>* %P2 ) {
|
||||||
|
%xFloat0.688 = load float* %P
|
||||||
|
%loadVector37.712 = load <4 x float>* %P2
|
||||||
|
%inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
|
||||||
|
store <4 x float> %inFloat3.713, <4 x float>* %P2
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
Generates:
|
||||||
|
|
||||||
|
_test:
|
||||||
|
pxor %xmm0, %xmm0
|
||||||
|
movd %xmm0, %eax ;; EAX = 0!
|
||||||
|
movl 8(%esp), %ecx
|
||||||
|
movaps (%ecx), %xmm0
|
||||||
|
pinsrw $6, %eax, %xmm0
|
||||||
|
shrl $16, %eax ;; EAX = 0 again!
|
||||||
|
pinsrw $7, %eax, %xmm0
|
||||||
|
movaps %xmm0, (%ecx)
|
||||||
|
ret
|
||||||
|
|
||||||
|
It would be better to generate:
|
||||||
|
|
||||||
|
_test:
|
||||||
|
movl 8(%esp), %ecx
|
||||||
|
movaps (%ecx), %xmm0
|
||||||
|
xor %eax, %eax
|
||||||
|
pinsrw $6, %eax, %xmm0
|
||||||
|
pinsrw $7, %eax, %xmm0
|
||||||
|
movaps %xmm0, (%ecx)
|
||||||
|
ret
|
||||||
|
|
||||||
|
or use pxor (to make a zero vector) and shuffle (to insert it).
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Some useful information in the Apple Altivec / SSE Migration Guide:
|
||||||
|
|
||||||
|
http://developer.apple.com/documentation/Performance/Conceptual/
|
||||||
|
Accelerate_sse_migration/index.html
|
||||||
|
|
||||||
|
e.g. SSE select using and, andnot, or. Various SSE compare translations.
|
@ -140,15 +140,6 @@ target specific hook.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
|
|
||||||
other fast SSE modes.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Think about doing i64 math in SSE regs.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
The DAG Isel doesn't fold the loads into the adds in this testcase. The
|
The DAG Isel doesn't fold the loads into the adds in this testcase. The
|
||||||
pattern selector does. This is because the chain value of the load gets
|
pattern selector does. This is because the chain value of the load gets
|
||||||
selected first, and the loads aren't checking to see if they are only used by
|
selected first, and the loads aren't checking to see if they are only used by
|
||||||
@ -194,74 +185,6 @@ better schedule. :)
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
This testcase should have no SSE instructions in it, and only one load from
|
|
||||||
a constant pool:
|
|
||||||
|
|
||||||
double %test3(bool %B) {
|
|
||||||
%C = select bool %B, double 123.412, double 523.01123123
|
|
||||||
ret double %C
|
|
||||||
}
|
|
||||||
|
|
||||||
Currently, the select is being lowered, which prevents the dag combiner from
|
|
||||||
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
|
|
||||||
|
|
||||||
The pattern isel got this one right.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
|
|
||||||
like this:
|
|
||||||
|
|
||||||
X += y
|
|
||||||
|
|
||||||
and the register allocator decides to spill X, it is cheaper to emit this as:
|
|
||||||
|
|
||||||
Y += [xslot]
|
|
||||||
store Y -> [xslot]
|
|
||||||
|
|
||||||
than as:
|
|
||||||
|
|
||||||
tmp = [xslot]
|
|
||||||
tmp += y
|
|
||||||
store tmp -> [xslot]
|
|
||||||
|
|
||||||
..and this uses one fewer register (so this should be done at load folding
|
|
||||||
time, not at spiller time). *Note* however that this can only be done
|
|
||||||
if Y is dead. Here's a testcase:
|
|
||||||
|
|
||||||
%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0]
|
|
||||||
implementation ; Functions:
|
|
||||||
declare void %printf(int, ...)
|
|
||||||
void %main() {
|
|
||||||
build_tree.exit:
|
|
||||||
br label %no_exit.i7
|
|
||||||
no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
|
|
||||||
%tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; <double> [#uses=1]
|
|
||||||
%tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; <double> [#uses=1]
|
|
||||||
%tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
|
|
||||||
%tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
|
|
||||||
br bool false, label %Compute_Tree.exit23, label %no_exit.i7
|
|
||||||
Compute_Tree.exit23: ; preds = %no_exit.i7
|
|
||||||
tail call void (int, ...)* %printf( int 0 )
|
|
||||||
store double %tmp.34.i18, double* null
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
We currently emit:
|
|
||||||
|
|
||||||
.BBmain_1:
|
|
||||||
xorpd %XMM1, %XMM1
|
|
||||||
addsd %XMM0, %XMM1
|
|
||||||
*** movsd %XMM2, QWORD PTR [%ESP + 8]
|
|
||||||
*** addsd %XMM2, %XMM1
|
|
||||||
*** movsd QWORD PTR [%ESP + 8], %XMM2
|
|
||||||
jmp .BBmain_1 # no_exit.i7
|
|
||||||
|
|
||||||
This is a bugpoint reduced testcase, which is why the testcase doesn't make
|
|
||||||
much sense (e.g. its an infinite loop). :)
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
In many cases, LLVM generates code like this:
|
In many cases, LLVM generates code like this:
|
||||||
|
|
||||||
_test:
|
_test:
|
||||||
@ -316,36 +239,6 @@ which is smaller.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
SSE should implement 'select_cc' using 'emulated conditional moves' that use
|
|
||||||
pcmp/pand/pandn/por to do a selection instead of a conditional branch:
|
|
||||||
|
|
||||||
double %X(double %Y, double %Z, double %A, double %B) {
|
|
||||||
%C = setlt double %A, %B
|
|
||||||
%z = add double %Z, 0.0 ;; select operand is not a load
|
|
||||||
%D = select bool %C, double %Y, double %z
|
|
||||||
ret double %D
|
|
||||||
}
|
|
||||||
|
|
||||||
We currently emit:
|
|
||||||
|
|
||||||
_X:
|
|
||||||
subl $12, %esp
|
|
||||||
xorpd %xmm0, %xmm0
|
|
||||||
addsd 24(%esp), %xmm0
|
|
||||||
movsd 32(%esp), %xmm1
|
|
||||||
movsd 16(%esp), %xmm2
|
|
||||||
ucomisd 40(%esp), %xmm1
|
|
||||||
jb LBB_X_2
|
|
||||||
LBB_X_1:
|
|
||||||
movsd %xmm0, %xmm2
|
|
||||||
LBB_X_2:
|
|
||||||
movsd %xmm2, (%esp)
|
|
||||||
fldl (%esp)
|
|
||||||
addl $12, %esp
|
|
||||||
ret
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
We should generate bts/btr/etc instructions on targets where they are cheap or
|
We should generate bts/btr/etc instructions on targets where they are cheap or
|
||||||
when codesize is important. e.g., for:
|
when codesize is important. e.g., for:
|
||||||
|
|
||||||
@ -375,12 +268,6 @@ when we can spare a register. It reduces code size.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
It's not clear whether we should use pxor or xorps / xorpd to clear XMM
|
|
||||||
registers. The choice may depend on subtarget information. We should do some
|
|
||||||
more experiments on different x86 machines.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
|
Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
|
||||||
get this:
|
get this:
|
||||||
|
|
||||||
@ -412,25 +299,6 @@ which is probably slower, but it's interesting at least :)
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Currently the x86 codegen isn't very good at mixing SSE and FPStack
|
|
||||||
code:
|
|
||||||
|
|
||||||
unsigned int foo(double x) { return x; }
|
|
||||||
|
|
||||||
foo:
|
|
||||||
subl $20, %esp
|
|
||||||
movsd 24(%esp), %xmm0
|
|
||||||
movsd %xmm0, 8(%esp)
|
|
||||||
fldl 8(%esp)
|
|
||||||
fisttpll (%esp)
|
|
||||||
movl (%esp), %eax
|
|
||||||
addl $20, %esp
|
|
||||||
ret
|
|
||||||
|
|
||||||
This will be solved when we go to a dynamic programming based isel.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Should generate min/max for stuff like:
|
Should generate min/max for stuff like:
|
||||||
|
|
||||||
void minf(float a, float b, float *X) {
|
void minf(float a, float b, float *X) {
|
||||||
@ -495,45 +363,6 @@ stores, TLB preheating, etc)
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
|
|
||||||
feasible.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
|
|
||||||
the reg-reg copy in this example:
|
|
||||||
|
|
||||||
float foo(int *x, float *y, unsigned c) {
|
|
||||||
float res = 0.0;
|
|
||||||
unsigned i;
|
|
||||||
for (i = 0; i < c; i++) {
|
|
||||||
float xx = (float)x[i];
|
|
||||||
xx = xx * y[i];
|
|
||||||
xx += res;
|
|
||||||
res = xx;
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
LBB_foo_3: # no_exit
|
|
||||||
cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
|
|
||||||
mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
|
|
||||||
addss %XMM0, %XMM1
|
|
||||||
inc %ESI
|
|
||||||
cmp %ESI, %ECX
|
|
||||||
**** movaps %XMM1, %XMM0
|
|
||||||
jb LBB_foo_3 # no_exit
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Codegen:
|
|
||||||
if (copysign(1.0, x) == copysign(1.0, y))
|
|
||||||
into:
|
|
||||||
if (x^y & mask)
|
|
||||||
when using SSE.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Optimize this into something reasonable:
|
Optimize this into something reasonable:
|
||||||
x * copysign(1.0, y) * copysign(1.0, z)
|
x * copysign(1.0, y) * copysign(1.0, z)
|
||||||
|
|
||||||
@ -611,39 +440,6 @@ directly %esp[0] if there are no other uses.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
|
|
||||||
of a v4sf value.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
|
|
||||||
Perhaps use pxor / xorp* to clear an XMM register first?
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Better codegen for:
|
|
||||||
|
|
||||||
void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
|
|
||||||
void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
|
|
||||||
|
|
||||||
For the latter we generate:
|
|
||||||
|
|
||||||
_f:
|
|
||||||
pxor %xmm0, %xmm0
|
|
||||||
movss 8(%esp), %xmm1
|
|
||||||
movaps %xmm0, %xmm2
|
|
||||||
unpcklps %xmm1, %xmm2
|
|
||||||
movss 4(%esp), %xmm1
|
|
||||||
unpcklps %xmm0, %xmm1
|
|
||||||
unpcklps %xmm2, %xmm1
|
|
||||||
movl 12(%esp), %eax
|
|
||||||
movaps %xmm1, (%eax)
|
|
||||||
ret
|
|
||||||
|
|
||||||
This seems like it should use shufps, one for each of a & b.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Adding to the list of cmp / test poor codegen issues:
|
Adding to the list of cmp / test poor codegen issues:
|
||||||
|
|
||||||
int test(__m128 *A, __m128 *B) {
|
int test(__m128 *A, __m128 *B) {
|
||||||
@ -676,327 +472,6 @@ We probably need some kind of target DAG combine hook to fix this.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
How to decide when to use the "floating point version" of logical ops? Here are
|
|
||||||
some code fragments:
|
|
||||||
|
|
||||||
movaps LCPI5_5, %xmm2
|
|
||||||
divps %xmm1, %xmm2
|
|
||||||
mulps %xmm2, %xmm3
|
|
||||||
mulps 8656(%ecx), %xmm3
|
|
||||||
addps 8672(%ecx), %xmm3
|
|
||||||
andps LCPI5_6, %xmm2
|
|
||||||
andps LCPI5_1, %xmm3
|
|
||||||
por %xmm2, %xmm3
|
|
||||||
movdqa %xmm3, (%edi)
|
|
||||||
|
|
||||||
movaps LCPI5_5, %xmm1
|
|
||||||
divps %xmm0, %xmm1
|
|
||||||
mulps %xmm1, %xmm3
|
|
||||||
mulps 8656(%ecx), %xmm3
|
|
||||||
addps 8672(%ecx), %xmm3
|
|
||||||
andps LCPI5_6, %xmm1
|
|
||||||
andps LCPI5_1, %xmm3
|
|
||||||
orps %xmm1, %xmm3
|
|
||||||
movaps %xmm3, 112(%esp)
|
|
||||||
movaps %xmm3, (%ebx)
|
|
||||||
|
|
||||||
Due to some minor source change, the latter case ended up using orps and movaps
|
|
||||||
instead of por and movdqa. Does it matter?
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Use movddup to splat a v2f64 directly from a memory source. e.g.
|
|
||||||
|
|
||||||
#include <emmintrin.h>
|
|
||||||
|
|
||||||
void test(__m128d *r, double A) {
|
|
||||||
*r = _mm_set1_pd(A);
|
|
||||||
}
|
|
||||||
|
|
||||||
llc:
|
|
||||||
|
|
||||||
_test:
|
|
||||||
movsd 8(%esp), %xmm0
|
|
||||||
unpcklpd %xmm0, %xmm0
|
|
||||||
movl 4(%esp), %eax
|
|
||||||
movapd %xmm0, (%eax)
|
|
||||||
ret
|
|
||||||
|
|
||||||
icc:
|
|
||||||
|
|
||||||
_test:
|
|
||||||
movl 4(%esp), %eax
|
|
||||||
movddup 8(%esp), %xmm0
|
|
||||||
movapd %xmm0, (%eax)
|
|
||||||
ret
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
|
|
||||||
to choose between movaps, movapd, and movdqa based on types of source and
|
|
||||||
destination?
|
|
||||||
|
|
||||||
How about andps, andpd, and pand? Do we really care about the type of the packed
|
|
||||||
elements? If not, why not always use the "ps" variants which are likely to be
|
|
||||||
shorter.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
We are emitting bad code for this:
|
|
||||||
|
|
||||||
float %test(float* %V, int %I, int %D, float %V) {
|
|
||||||
entry:
|
|
||||||
%tmp = seteq int %D, 0
|
|
||||||
br bool %tmp, label %cond_true, label %cond_false23
|
|
||||||
|
|
||||||
cond_true:
|
|
||||||
%tmp3 = getelementptr float* %V, int %I
|
|
||||||
%tmp = load float* %tmp3
|
|
||||||
%tmp5 = setgt float %tmp, %V
|
|
||||||
%tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
|
|
||||||
%tmp7 = or bool %tmp5, %tmp6
|
|
||||||
br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
|
|
||||||
|
|
||||||
cond_next:
|
|
||||||
%tmp10 = add int %I, 1
|
|
||||||
%tmp12 = getelementptr float* %V, int %tmp10
|
|
||||||
%tmp13 = load float* %tmp12
|
|
||||||
%tmp15 = setle float %tmp13, %V
|
|
||||||
%tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
|
|
||||||
%tmp17 = or bool %tmp15, %tmp16
|
|
||||||
%retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
|
|
||||||
ret float %retval
|
|
||||||
|
|
||||||
cond_false23:
|
|
||||||
%tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
|
|
||||||
ret float %tmp28
|
|
||||||
|
|
||||||
UnifiedReturnBlock: ; preds = %cond_true
|
|
||||||
ret float 0.000000e+00
|
|
||||||
}
|
|
||||||
|
|
||||||
declare bool %llvm.isunordered.f32(float, float)
|
|
||||||
|
|
||||||
declare float %foo(float*, int, int, float)
|
|
||||||
|
|
||||||
|
|
||||||
It exposes a known load folding problem:
|
|
||||||
|
|
||||||
movss (%edx,%ecx,4), %xmm1
|
|
||||||
ucomiss %xmm1, %xmm0
|
|
||||||
|
|
||||||
As well as this:
|
|
||||||
|
|
||||||
LBB_test_2: # cond_next
|
|
||||||
movss LCPI1_0, %xmm2
|
|
||||||
pxor %xmm3, %xmm3
|
|
||||||
ucomiss %xmm0, %xmm1
|
|
||||||
jbe LBB_test_6 # cond_next
|
|
||||||
LBB_test_5: # cond_next
|
|
||||||
movaps %xmm2, %xmm3
|
|
||||||
LBB_test_6: # cond_next
|
|
||||||
movss %xmm3, 40(%esp)
|
|
||||||
flds 40(%esp)
|
|
||||||
addl $44, %esp
|
|
||||||
ret
|
|
||||||
|
|
||||||
Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
|
|
||||||
three moves (movss, movaps, movss).
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
External test Nurbs exposed some problems. Look for
|
|
||||||
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
|
|
||||||
emits:
|
|
||||||
|
|
||||||
movaps (%edx), %xmm2 #59.21
|
|
||||||
movaps (%edx), %xmm5 #60.21
|
|
||||||
movaps (%edx), %xmm4 #61.21
|
|
||||||
movaps (%edx), %xmm3 #62.21
|
|
||||||
movl 40(%ecx), %ebp #69.49
|
|
||||||
shufps $0, %xmm2, %xmm5 #60.21
|
|
||||||
movl 100(%esp), %ebx #69.20
|
|
||||||
movl (%ebx), %edi #69.20
|
|
||||||
imull %ebp, %edi #69.49
|
|
||||||
addl (%eax), %edi #70.33
|
|
||||||
shufps $85, %xmm2, %xmm4 #61.21
|
|
||||||
shufps $170, %xmm2, %xmm3 #62.21
|
|
||||||
shufps $255, %xmm2, %xmm2 #63.21
|
|
||||||
lea (%ebp,%ebp,2), %ebx #69.49
|
|
||||||
negl %ebx #69.49
|
|
||||||
lea -3(%edi,%ebx), %ebx #70.33
|
|
||||||
shll $4, %ebx #68.37
|
|
||||||
addl 32(%ecx), %ebx #68.37
|
|
||||||
testb $15, %bl #91.13
|
|
||||||
jne L_B1.24 # Prob 5% #91.13
|
|
||||||
|
|
||||||
This is the llvm code after instruction scheduling:
|
|
||||||
|
|
||||||
cond_next140 (0xa910740, LLVM BB @0xa90beb0):
|
|
||||||
%reg1078 = MOV32ri -3
|
|
||||||
%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
|
|
||||||
%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
|
|
||||||
%reg1080 = IMUL32rr %reg1079, %reg1037
|
|
||||||
%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
|
|
||||||
%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
|
|
||||||
%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
|
|
||||||
%reg1082 = SHL32ri %reg1038, 4
|
|
||||||
%reg1039 = ADD32rr %reg1036, %reg1082
|
|
||||||
%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
|
|
||||||
%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
|
|
||||||
%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
|
|
||||||
%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
|
|
||||||
%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
|
|
||||||
%reg1040 = MOV32rr %reg1039
|
|
||||||
%reg1084 = AND32ri8 %reg1039, 15
|
|
||||||
CMP32ri8 %reg1084, 0
|
|
||||||
JE mbb<cond_next204,0xa914d30>
|
|
||||||
|
|
||||||
Still ok. After register allocation:
|
|
||||||
|
|
||||||
cond_next140 (0xa910740, LLVM BB @0xa90beb0):
|
|
||||||
%EAX = MOV32ri -3
|
|
||||||
%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
|
|
||||||
ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
|
|
||||||
%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
|
|
||||||
%EDX = MOV32rm %EDX, 1, %NOREG, 40
|
|
||||||
IMUL32rr %EAX<def&use>, %EDX
|
|
||||||
%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
|
|
||||||
%ESI = MOV32rm %ESI, 1, %NOREG, 0
|
|
||||||
MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
|
|
||||||
%EAX = LEA32r %ESI, 1, %EAX, -3
|
|
||||||
%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
|
|
||||||
%ESI = MOV32rm %ESI, 1, %NOREG, 32
|
|
||||||
%EDI = MOV32rr %EAX
|
|
||||||
SHL32ri %EDI<def&use>, 4
|
|
||||||
ADD32rr %EDI<def&use>, %ESI
|
|
||||||
%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
|
|
||||||
%XMM1 = MOVAPSrr %XMM0
|
|
||||||
SHUFPSrr %XMM1<def&use>, %XMM1, 170
|
|
||||||
%XMM2 = MOVAPSrr %XMM0
|
|
||||||
SHUFPSrr %XMM2<def&use>, %XMM2, 0
|
|
||||||
%XMM3 = MOVAPSrr %XMM0
|
|
||||||
SHUFPSrr %XMM3<def&use>, %XMM3, 255
|
|
||||||
SHUFPSrr %XMM0<def&use>, %XMM0, 85
|
|
||||||
%EBX = MOV32rr %EDI
|
|
||||||
AND32ri8 %EBX<def&use>, 15
|
|
||||||
CMP32ri8 %EBX, 0
|
|
||||||
JE mbb<cond_next204,0xa914d30>
|
|
||||||
|
|
||||||
This looks really bad. The problem is shufps is a destructive opcode. Since it
|
|
||||||
appears as operand two in more than one shufps op, it resulted in a number of
|
|
||||||
copies. Note icc also suffers from the same problem. Either the instruction
|
|
||||||
selector should select pshufd or the register allocator can make the two-address
|
|
||||||
to three-address transformation.
|
|
||||||
|
|
||||||
It also exposes some other problems. See MOV32ri -3 and the spills.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
|
|
||||||
|
|
||||||
LLVM is producing bad code.
|
|
||||||
|
|
||||||
LBB_main_4: # cond_true44
|
|
||||||
addps %xmm1, %xmm2
|
|
||||||
subps %xmm3, %xmm2
|
|
||||||
movaps (%ecx), %xmm4
|
|
||||||
movaps %xmm2, %xmm1
|
|
||||||
addps %xmm4, %xmm1
|
|
||||||
addl $16, %ecx
|
|
||||||
incl %edx
|
|
||||||
cmpl $262144, %edx
|
|
||||||
movaps %xmm3, %xmm2
|
|
||||||
movaps %xmm4, %xmm3
|
|
||||||
jne LBB_main_4 # cond_true44
|
|
||||||
|
|
||||||
There are two problems. 1) No need for two loop induction variables. We can
|
|
||||||
compare against 262144 * 16. 2) Known register coalescer issue. We should
|
|
||||||
be able to eliminate one of the movaps:
|
|
||||||
|
|
||||||
addps %xmm2, %xmm1 <=== Commute!
|
|
||||||
subps %xmm3, %xmm1
|
|
||||||
movaps (%ecx), %xmm4
|
|
||||||
movaps %xmm1, %xmm1 <=== Eliminate!
|
|
||||||
addps %xmm4, %xmm1
|
|
||||||
addl $16, %ecx
|
|
||||||
incl %edx
|
|
||||||
cmpl $262144, %edx
|
|
||||||
movaps %xmm3, %xmm2
|
|
||||||
movaps %xmm4, %xmm3
|
|
||||||
jne LBB_main_4 # cond_true44
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Consider:
|
|
||||||
|
|
||||||
__m128 test(float a) {
|
|
||||||
return _mm_set_ps(0.0, 0.0, 0.0, a*a);
|
|
||||||
}
|
|
||||||
|
|
||||||
This compiles into:
|
|
||||||
|
|
||||||
movss 4(%esp), %xmm1
|
|
||||||
mulss %xmm1, %xmm1
|
|
||||||
xorps %xmm0, %xmm0
|
|
||||||
movss %xmm1, %xmm0
|
|
||||||
ret
|
|
||||||
|
|
||||||
Because mulss doesn't modify the top 3 elements, the top elements of
|
|
||||||
xmm1 are already zero'd. We could compile this to:
|
|
||||||
|
|
||||||
movss 4(%esp), %xmm0
|
|
||||||
mulss %xmm0, %xmm0
|
|
||||||
ret
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
Here's a sick and twisted idea. Consider code like this:
|
|
||||||
|
|
||||||
__m128 test(__m128 a) {
|
|
||||||
float b = *(float*)&a;
|
|
||||||
...
|
|
||||||
return _mm_set_ps(0.0, 0.0, 0.0, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
This might compile to this code:
|
|
||||||
|
|
||||||
movaps c(%esp), %xmm1
|
|
||||||
xorps %xmm0, %xmm0
|
|
||||||
movss %xmm1, %xmm0
|
|
||||||
ret
|
|
||||||
|
|
||||||
Now consider if the ... code caused xmm1 to get spilled. This might produce
|
|
||||||
this code:
|
|
||||||
|
|
||||||
movaps c(%esp), %xmm1
|
|
||||||
movaps %xmm1, c2(%esp)
|
|
||||||
...
|
|
||||||
|
|
||||||
xorps %xmm0, %xmm0
|
|
||||||
movaps c2(%esp), %xmm1
|
|
||||||
movss %xmm1, %xmm0
|
|
||||||
ret
|
|
||||||
|
|
||||||
However, since the reload is only used by these instructions, we could
|
|
||||||
"fold" it into the uses, producing something like this:
|
|
||||||
|
|
||||||
movaps c(%esp), %xmm1
|
|
||||||
movaps %xmm1, c2(%esp)
|
|
||||||
...
|
|
||||||
|
|
||||||
movss c2(%esp), %xmm0
|
|
||||||
ret
|
|
||||||
|
|
||||||
... saving two instructions.
|
|
||||||
|
|
||||||
The basic idea is that a reload from a spill slot, can, if only one 4-byte
|
|
||||||
chunk is used, bring in 3 zeros and the one element instead of 4 elements.
|
|
||||||
This can be used to simplify a variety of shuffle operations, where the
|
|
||||||
elements are fixed zeros.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
We generate significantly worse code for this than GCC:
|
We generate significantly worse code for this than GCC:
|
||||||
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
|
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
|
||||||
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
|
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
|
||||||
@ -1005,56 +480,6 @@ There is also one case we do worse on PPC.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
For this:
|
|
||||||
|
|
||||||
#include <emmintrin.h>
|
|
||||||
void test(__m128d *r, __m128d *A, double B) {
|
|
||||||
*r = _mm_loadl_pd(*A, &B);
|
|
||||||
}
|
|
||||||
|
|
||||||
We generate:
|
|
||||||
|
|
||||||
subl $12, %esp
|
|
||||||
movsd 24(%esp), %xmm0
|
|
||||||
movsd %xmm0, (%esp)
|
|
||||||
movl 20(%esp), %eax
|
|
||||||
movapd (%eax), %xmm0
|
|
||||||
movlpd (%esp), %xmm0
|
|
||||||
movl 16(%esp), %eax
|
|
||||||
movapd %xmm0, (%eax)
|
|
||||||
addl $12, %esp
|
|
||||||
ret
|
|
||||||
|
|
||||||
icc generates:
|
|
||||||
|
|
||||||
movl 4(%esp), %edx #3.6
|
|
||||||
movl 8(%esp), %eax #3.6
|
|
||||||
movapd (%eax), %xmm0 #4.22
|
|
||||||
movlpd 12(%esp), %xmm0 #4.8
|
|
||||||
movapd %xmm0, (%edx) #4.3
|
|
||||||
ret #5.1
|
|
||||||
|
|
||||||
So icc is smart enough to know that B is in memory so it doesn't load it and
|
|
||||||
store it back to the stack.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
__m128d test1( __m128d A, __m128d B) {
|
|
||||||
return _mm_shuffle_pd(A, B, 0x3);
|
|
||||||
}
|
|
||||||
|
|
||||||
compiles to
|
|
||||||
|
|
||||||
shufpd $3, %xmm1, %xmm0
|
|
||||||
|
|
||||||
Perhaps it's better to use unpckhpd instead?
|
|
||||||
|
|
||||||
unpckhpd %xmm1, %xmm0
|
|
||||||
|
|
||||||
Don't know if unpckhpd is faster. But it is shorter.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
If shorter, we should use things like:
|
If shorter, we should use things like:
|
||||||
movzwl %ax, %eax
|
movzwl %ax, %eax
|
||||||
instead of:
|
instead of:
|
||||||
@ -1114,10 +539,3 @@ _foo:
|
|||||||
ret
|
ret
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Some useful information in the Apple Altivec / SSE Migration Guide:
|
|
||||||
|
|
||||||
http://developer.apple.com/documentation/Performance/Conceptual/
|
|
||||||
Accelerate_sse_migration/index.html
|
|
||||||
|
|
||||||
e.g. SSE select using and, andnot, or. Various SSE compare translations.
|
|
||||||
|
Reference in New Issue
Block a user