mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-16 11:30:51 +00:00
4ad0654bb4
Patch to allow (v)blendps, (v)blendpd, (v)pblendw and vpblendd instructions to be commuted - swaps the src registers and inverts the blend mask. This is primarily to improve memory folding (see new tests), but it also improves the quality of shuffles (see modified tests). Differential Revision: http://reviews.llvm.org/D6015 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221313 91177308-0d34-0410-b5e6-96231b3b80d8
35 lines
1.3 KiB
LLVM
35 lines
1.3 KiB
LLVM
; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s
|
|
|
|
; pblendw with the loaded vector as the FIRST source and %a as the second.
; Immediate 17 (0b00010001) selects elements 0 and 4 from the second source
; (%a) and the remaining elements from the loaded vector. The expected output
; below has the load folded as the memory operand of a single pblendw, which
; requires the blend to be commuted (sources swapped, mask inverted).
define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
  %1 = load <8 x i16>* %b
  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
  ret <8 x i16> %2

  ; Anchor the checks to this function, then require exactly one folded blend
  ; immediately followed by the return.
  ;CHECK-LABEL: commute_fold_pblendw
  ;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
  ;CHECK-NEXT: retq
}

declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
|
|
|
|
; blendps with the loaded vector as the FIRST source and %a as the second.
; Immediate 3 (0b0011) selects elements 0 and 1 from the second source (%a)
; and elements 2 and 3 from the loaded vector. The expected output below has
; the load folded as the memory operand of a single blendps, which requires
; the blend to be commuted (sources swapped, mask inverted).
define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
  %1 = load <4 x float>* %b
  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
  ret <4 x float> %2

  ; Anchor the checks to this function, then require exactly one folded blend
  ; immediately followed by the return.
  ;CHECK-LABEL: commute_fold_blendps
  ;CHECK: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
  ;CHECK-NEXT: retq
}

declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
|
|
|
|
; blendpd with the loaded vector as the FIRST source and %a as the second.
; Immediate 1 (0b01) selects element 0 from the second source (%a) and
; element 1 from the loaded vector. The expected output below has the load
; folded as the memory operand of a single blendpd, which requires the blend
; to be commuted (sources swapped, mask inverted).
define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
  %1 = load <2 x double>* %b
  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
  ret <2 x double> %2

  ; Anchor the checks to this function (the label must name this SSE4.1
  ; function, not an AVX "v"-prefixed variant), then require exactly one
  ; folded blend immediately followed by the return.
  ;CHECK-LABEL: commute_fold_blendpd
  ;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
  ;CHECK-NEXT: retq
}

declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
|