diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll deleted file mode 100644 index c20775bacad..00000000000 --- a/test/CodeGen/X86/avx-vperm2f128.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: _A -; CHECK: vperm2f128 $1 -define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> - ret <8 x float> %shuffle -} - -; CHECK: _B -; CHECK: vblendps $240 -define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> - ret <8 x float> %shuffle -} - -; CHECK: _C -; CHECK: vperm2f128 $0 -define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> - ret <8 x float> %shuffle -} - -; CHECK: _D -; CHECK: vperm2f128 $17 -define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> - ret <8 x float> %shuffle -} - -; CHECK: _E -; CHECK: vperm2f128 $17 -define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> - ret <32 x i8> %shuffle -} - -; CHECK: _E2 -; CHECK: vperm2f128 $3 -define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> - ret <4 x i64> %shuffle -} - -;;;; Cases with undef indicies mixed in the mask - -; CHECK: _F -; CHECK: vperm2f128 $33 -define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> - ret <8 x float> %shuffle -} - -;;;; Cases we must not select vperm2f128 - -; CHECK: _G -; CHECK-NOT: vperm2f128 -define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> - ret <8 x float> %shuffle -} diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll new file mode 100644 index 00000000000..9d5be4ae28d --- /dev/null +++ b/test/CodeGen/X86/avx-vperm2x128.ll @@ -0,0 +1,217 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 + +define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: A: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: B: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: C: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: D: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: E: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %shuffle +} + +define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: E2: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> + ret <4 x i64> %shuffle +} + +define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: Ei: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: Ei: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpaddb {{.*}}, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <32 x i8> %a, + %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> + ret <32 x i8> %shuffle +} + +define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E2i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: E2i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <4 x i64> %a, + %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> + ret <4 x i64> %shuffle +} + +define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E3i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: E3i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpbroadcastd {{.*}}, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <8 x i32> %a, + %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> + ret <8 x i32> %shuffle +} + +define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E4i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: E4i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpaddw {{.*}}, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <16 x i16> %a, + %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> + ret <16 x i16> %shuffle +} + +define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E5i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: E5i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddw {{.*}}, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] +; AVX2-NEXT: retq +entry: + %c = load <16 x i16>* %a + %d = load <16 x i16>* %b + %c2 = add <16 x i16> %c, + %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> + ret <16 x i16> %shuffle +} + +;;;; Cases with undef indicies mixed in the mask + +define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: F: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +;;;; Cases we must not select vperm2f128 + +define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: G: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: G: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll deleted file mode 100644 index 1937db5d7c1..00000000000 --- a/test/CodeGen/X86/avx2-vperm2i128.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -; CHECK: vperm2i128 $17 -define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <32 x i8> %a, - %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> - ret <32 x i8> %shuffle -} - -; CHECK: vperm2i128 $3 -define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <4 x i64> %a, - %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> - ret <4 x i64> %shuffle -} - -; CHECK: vperm2i128 $49 -define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <8 x i32> %a, - %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> - ret <8 x i32> %shuffle -} - -; CHECK: vperm2i128 $2 -define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <16 x i16> %a, - %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> - ret <16 x i16> %shuffle -} - -; CHECK: vperm2i128 $2, (% -define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { -entry: - %c = load <16 x i16>* %a - %d = load <16 x i16>* %b - %c2 = add <16 x i16> %c, - %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> - ret <16 x i16> %shuffle -}