From 5440f2198dce5f02031d09390edf62a593af5370 Mon Sep 17 00:00:00 2001 From: Michael Kuperstein Date: Tue, 19 May 2015 11:06:56 +0000 Subject: [PATCH] [X86] ABI change for x86-32: pass 3 vector arguments in-register instead of 4, except on Darwin. This changes the ABI used on 32-bit x86 for passing vector arguments. Historically, clang passes the first 4 vector arguments in-register, and additional vector arguments on the stack, regardless of platform. That is different from the behavior of gcc, icc, and msvc, all of which pass only the first 3 arguments in-register. The 3-register convention is documented, unofficially, in Agner's calling convention guide, and, officially, in the recently released version 1.0 of the i386 psABI. Darwin is kept as is because the OS X ABI Function Call Guide explicitly documents the current (4-register) behavior. This fixes PR21510 Differential revision: http://reviews.llvm.org/D9644 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237682 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86CallingConv.td | 89 +++++++++++++------ test/CodeGen/X86/fp-trunc.ll | 8 +- .../CodeGen/X86/illegal-vector-args-return.ll | 2 + .../CodeGen/X86/x86-32-vector-calling-conv.ll | 44 +++++++++ 4 files changed, 114 insertions(+), 29 deletions(-) create mode 100644 test/CodeGen/X86/x86-32-vector-calling-conv.ll diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 790160662ef..8f88888f5ce 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -445,9 +445,61 @@ def CC_X86_64_AnyReg : CallingConv<[ // X86 C Calling Convention //===----------------------------------------------------------------------===// +/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector +/// values are spilled on the stack. +def CC_X86_32_Vector_Common : CallingConv<[ + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. 
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in +// vector registers +def CC_X86_32_Vector_Standard : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfSubtarget<"hasFp256()", + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in +// vector registers. +def CC_X86_32_Vector_Darwin : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfSubtarget<"hasFp256()", + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP -/// values are spilled on the stack, and the first 4 vector values go in XMM -/// regs. +/// values are spilled on the stack. def CC_X86_32_Common : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<4, 4>>, @@ -483,33 +535,16 @@ def CC_X86_32_Common : CallingConv<[ CCIfType<[v32i1], CCPromoteToType<v32i8>>, CCIfType<[v64i1], CCPromoteToType<v64i8>>, - // The first 4 SSE vector arguments are passed in XMM registers. - CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, - - // The first 4 AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg<CCIfSubtarget<"hasFp256()", - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, - - // The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
- CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>, - - // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, - - // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToStack<32, 32>>, - - // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToStack<64, 64>>, - // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are // passed in the parameter area. - CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>; + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + + // Darwin passes vectors in a form that differs from the i386 psABI + CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>, + + // Otherwise, drop to 'normal' X86-32 CC + CCDelegateTo<CC_X86_32_Vector_Standard> +]>; def CC_X86_32_C : CallingConv<[ // Promote i1/i8/i16 arguments to i32. diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll index 6424bfc9c21..807a8c8fe5e 100644 --- a/test/CodeGen/X86/fp-trunc.ll +++ b/test/CodeGen/X86/fp-trunc.ll @@ -1,6 +1,8 @@ ; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX +target triple = "i686-pc-linux-gnu" + define <1 x float> @test1(<1 x double> %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: @@ -59,12 +61,14 @@ define <4 x float> @test3(<4 x double> %x) nounwind { define <8 x float> @test4(<8 x double> %x) nounwind { ; CHECK-LABEL: test4: ; CHECK: # BB#0: +; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: cvtpd2ps %xmm3, %xmm3 ; CHECK-NEXT: cvtpd2ps %xmm2, %xmm1 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: cvtpd2ps 16(%esp), %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: addl
$12, %esp ; CHECK-NEXT: retl ; ; AVX-LABEL: test4: diff --git a/test/CodeGen/X86/illegal-vector-args-return.ll b/test/CodeGen/X86/illegal-vector-args-return.ll index 62a21f4c5aa..d783d4fa1b4 100644 --- a/test/CodeGen/X86/illegal-vector-args-return.ll +++ b/test/CodeGen/X86/illegal-vector-args-return.ll @@ -3,6 +3,8 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm3, %xmm1" ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm2, %xmm0" +target triple = "i686-apple-darwin8" + define <4 x double> @foo(<4 x double> %x, <4 x double> %z) { %y = fmul <4 x double> %x, %z ret <4 x double> %y diff --git a/test/CodeGen/X86/x86-32-vector-calling-conv.ll b/test/CodeGen/X86/x86-32-vector-calling-conv.ll new file mode 100644 index 00000000000..b2bda7ab8d0 --- /dev/null +++ b/test/CodeGen/X86/x86-32-vector-calling-conv.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=DARWIN +; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=LINUX + +; CHECK-LABEL: test_sse: +; DARWIN-DAG: vpaddd %xmm1, %xmm0, %xmm0 +; DARWIN-DAG: vpaddd %xmm3, %xmm2, %xmm1 +; DARWIN: vpaddd %xmm1, %xmm0, %xmm0 +; LINUX-DAG: vpaddd %xmm1, %xmm0, %xmm0 +; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm1 +; LINUX: vpaddd %xmm1, %xmm0, %xmm0 +define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind { + %r0 = add <4 x i32> %a, %b + %r1 = add <4 x i32> %c, %d + %ret = add <4 x i32> %r0, %r1 + ret <4 x i32> %ret +} + +; CHECK-LABEL: test_avx: +; DARWIN-DAG: vpaddd %ymm1, %ymm0, %ymm0 +; DARWIN-DAG: vpaddd %ymm3, %ymm2, %ymm1 +; DARWIN: vpaddd %ymm1, %ymm0, %ymm0 +; LINUX-DAG: vpaddd %ymm1, %ymm0, %ymm0 +; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm1 +; LINUX: vpaddd %ymm1, %ymm0, %ymm0 +define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind { + %r0 = add <8 x i32> %a, %b + %r1 = add <8 x i32> %c, 
%d + %ret = add <8 x i32> %r0, %r1 + ret <8 x i32> %ret +} + +; CHECK-LABEL: test_avx512: +; DARWIN-DAG: vpaddd %zmm1, %zmm0, %zmm0 +; DARWIN-DAG: vpaddd %zmm3, %zmm2, %zmm1 +; DARWIN: vpaddd %zmm1, %zmm0, %zmm0 +; LINUX-DAG: vpaddd %zmm1, %zmm0, %zmm0 +; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm1 +; LINUX: vpaddd %zmm1, %zmm0, %zmm0 +define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind { + %r0 = add <16 x i32> %a, %b + %r1 = add <16 x i32> %c, %d + %ret = add <16 x i32> %r0, %r1 + ret <16 x i32> %ret +}