diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index b4fc53ad0aa..cadfc20bbb1 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -456,6 +456,18 @@ icc generates:
 So icc is smart enough to know that B is in memory so it doesn't load it and
 store it back to stack.
 
+This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
+lowering it to a load+insertelement instead. We already match the load+shuffle
+as movlpd, so this should be easy. We already get optimal code for:
+
+define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
+entry:
+        %tmp2 = load <2 x double>* %A, align 16
+        %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
+        store <2 x double> %tmp8, <2 x double>* %r, align 16
+        ret void
+}
+
 //===---------------------------------------------------------------------===//
 
 __m128d test1( __m128d A, __m128d B) {
@@ -476,10 +488,10 @@ Don't know if unpckhpd is faster. But it is shorter.
 
 This code generates ugly code, probably due to costs being off or something:
 
-void %test(float* %P, <4 x float>* %P2 ) {
+define void @test(float* %P, <4 x float>* %P2 ) {
         %xFloat0.688 = load float* %P
-        %loadVector37.712 = load <4 x float>* %P2
-        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+        %tmp = load <4 x float>* %P2
+        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
         store <4 x float> %inFloat3.713, <4 x float>* %P2
         ret void
 }
@@ -487,17 +499,16 @@ void %test(float* %P, <4 x float>* %P2 ) {
 Generates:
 
 _test:
-        pxor %xmm0, %xmm0
-        movd %xmm0, %eax        ;; EAX = 0!
-        movl 8(%esp), %ecx
-        movaps (%ecx), %xmm0
-        pinsrw $6, %eax, %xmm0
-        shrl $16, %eax          ;; EAX = 0 again!
-        pinsrw $7, %eax, %xmm0
-        movaps %xmm0, (%ecx)
-        ret
+        movl    8(%esp), %eax
+        movaps  (%eax), %xmm0
+        pxor    %xmm1, %xmm1
+        movaps  %xmm0, %xmm2
+        shufps  $50, %xmm1, %xmm2
+        shufps  $132, %xmm2, %xmm0
+        movaps  %xmm0, (%eax)
+        ret
 
-It would be better to generate:
+Would it be better to generate:
 
 _test:
         movl 8(%esp), %ecx
@@ -508,7 +519,7 @@ _test:
         movaps %xmm0, (%ecx)
         ret
 
-or use pxor (to make a zero vector) and shuffle (to insert it).
+?
 
 //===---------------------------------------------------------------------===//
 
@@ -576,32 +587,6 @@ swizzle:
 
 //===---------------------------------------------------------------------===//
 
-This code:
-
-#include <emmintrin.h>
-__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
-
-Should turn into a single 'movq %rdi, %xmm0' instruction.  Instead, we
-get this (on x86-64):
-
-_test:
-        movd %rdi, %xmm1
-        xorps %xmm0, %xmm0
-        movsd %xmm1, %xmm0
-        ret
-
-The LLVM IR is:
-
-target triple = "x86_64-apple-darwin8"
-define <2 x i64> @test(i64 %i) {
-entry:
-        %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
-        %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
-        ret <2 x i64> %tmp11
-}
-
-//===---------------------------------------------------------------------===//
-
 These functions should produce the same code:
 
 #include <emmintrin.h>
@@ -671,43 +656,6 @@ beneficial because it prevents the load from being folded into the multiply.
 
 //===---------------------------------------------------------------------===//
 
-In this loop:
-
-bb49:           ; preds = %bb49, %bb49.preheader
-        %indvar = phi i32 [ 0, %bb49.preheader ], [ %indvar.next, %bb49 ]      ; <i32> [#uses=2]
-        %dp.089.0.rec = shl i32 %indvar, 3              ; <i32> [#uses=2]
-        %dp.089.0 = getelementptr i32* %tmp89, i32 %dp.089.0.rec        ; <i32*> [#uses=1]
-        %tmp5051 = bitcast i32* %dp.089.0 to <2 x i64>*         ; <<2 x i64>*> [#uses=1]
-        store <2 x i64> zeroinitializer, <2 x i64>* %tmp5051, align 16
-        %dp.089.0.sum105 = or i32 %dp.089.0.rec, 4              ; <i32> [#uses=1]
-        %tmp56 = getelementptr i32* %tmp89, i32 %dp.089.0.sum105        ; <i32*> [#uses=1]
-        %tmp5657 = bitcast i32* %tmp56 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
-        store <2 x i64> zeroinitializer, <2 x i64>* %tmp5657, align 16
-        %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=2]
-        %exitcond = icmp eq i32 %indvar.next, %tmp98            ; <i1> [#uses=1]
-        br i1 %exitcond, label %bb72, label %bb49
-
-we get:
-
-LBB5_6: # bb49.preheader
-        shlw    $2, %si
-        decw    %si
-        movzwl  %si, %eax
-        incl    %eax
-        xorl    %ecx, %ecx
-LBB5_7: # bb49
-        xorps   %xmm0, %xmm0    # (1)
-        movaps  %xmm0, (%edx)
-        movaps  %xmm0, 16(%edx)
-        addl    $32, %edx
-        incl    %ecx
-        cmpl    %eax, %ecx
-        jne     LBB4_7  # bb47
-
-The instruction at (1) can be moved out of the main body of the loop.
-
-//===---------------------------------------------------------------------===//
-
 These functions:
 
 #include 
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
new file mode 100644
index 00000000000..cca436bf643
--- /dev/null
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | not grep movsd
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movd.*%rdi,.*%xmm0}
+
+define <2 x i64> @test(i64 %i) nounwind {
+entry:
+        %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
+        %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+        ret <2 x i64> %tmp11
+}
+
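
For reference, a user-level C view of the llvm.x86.sse2.loadl.pd note added
above. This is an editorial sketch, not part of the patch: the function name
loadl_example is made up, and it assumes an SSE2-capable compiler providing
<emmintrin.h>.

#include <emmintrin.h>

/* _mm_loadl_pd replaces the low double of A with *B; today it lowers
   through the llvm.x86.sse2.loadl.pd intrinsic.  Lowered instead as
   load+insertelement, it would hit the existing movlpd pattern. */
__m128d loadl_example(__m128d A, const double *B) {
  return _mm_loadl_pd(A, B);
}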
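
Likewise, a C reproducer for the case vec_set-8.ll now tests, taken from the
README entry this patch removes; compiled for x86-64 it should now produce a
single 'movq %rdi, %xmm0' instead of the old movd+xorps+movsd sequence:

#include <emmintrin.h>

/* Move a 64-bit integer into the low lane of a <2 x i64>, zeroing the
   high lane. */
__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }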