llvm-6502/test/CodeGen/PowerPC/ppc64-func-desc-hoist.ll

; Compiles the loop in @bar twice with llc for -mcpu=a2: once with the
; invariant-function-descriptors target attribute left at its default, and once
; with it switched off through -mattr. Each output is verified under its own
; FileCheck prefix, so the two expected instruction sequences can differ.
; RUN: llc -mcpu=a2 < %s | FileCheck %s -check-prefix=INVFUNCDESC
; RUN: llc -mcpu=a2 -mattr=-invariant-function-descriptors < %s | FileCheck %s -check-prefix=NONINVFUNCDESC
; Data layout: big-endian ("E"), ELF-style name mangling ("m:e"), i64 aligned
; to 64 bits, native integer widths of 32 and 64 bits.
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"

; bar(x): calls through the function pointer %x 1,600,000,000 times from a
; single-block loop. The FileCheck directives embedded in the body describe
; where the loads feeding that indirect call must sit relative to the loop in
; each of the two llc runs.
; Function Attrs: nounwind
define void @bar(void (...)* nocapture %x) #0 {
entry:
  ; Reinterpret the variadic-typed pointer as a zero-argument function pointer
  ; so that it can be called with no arguments.
  %callee.knr.cast = bitcast void (...)* %x to void ()*
  br label %for.body

; Expectations for the first llc run (the INVFUNCDESC prefix): three doubleword
; loads at offsets 8, 16 and 0 from r3 must appear, in any order, before the
; loop block's label is printed -- that is, outside the loop body. The
; registers they target are captured as REG1, REG2 and REG3.
; INVFUNCDESC-LABEL: @bar
; INVFUNCDESC-DAG: ld [[REG1:[0-9]+]], 8(3)
; INVFUNCDESC-DAG: ld [[REG2:[0-9]+]], 16(3)
; INVFUNCDESC-DAG: ld [[REG3:[0-9]+]], 0(3)

; Inside the loop only register traffic is expected: r2 is saved to 40(r1),
; then (in any order) the offset-0 value goes to CTR, the offset-16 value to
; r11 and the offset-8 value to r2. bctrl makes the call, and the instruction
; immediately after it must reload r2 from 40(r1).
; NOTE(review): offset 0 / 8 / 16 presumably correspond to the code address,
; TOC pointer and environment pointer of the function descriptor -- confirm
; against the ABI document.
; INVFUNCDESC: %for.body
; INVFUNCDESC: std 2, 40(1)
; INVFUNCDESC-DAG: mtctr [[REG3]]
; INVFUNCDESC-DAG: mr 11, [[REG2]]
; INVFUNCDESC-DAG: mr 2, [[REG1]]
; INVFUNCDESC: bctrl
; INVFUNCDESC-NEXT: ld 2, 40(1)

; Expectations for the second llc run (attribute disabled): the same three
; loads must instead appear after the loop block's label, following the save
; of r2, and load straight into r3, r11 and r2 using r30 as the base register.
; The call and the adjacent r2 reload are the same as in the first run.
; NONINVFUNCDESC-LABEL: @bar
; NONINVFUNCDESC: %for.body
; NONINVFUNCDESC: std 2, 40(1)
; NONINVFUNCDESC-DAG: ld 3, 0(30)
; NONINVFUNCDESC-DAG: ld 11, 16(30)
; NONINVFUNCDESC-DAG: ld 2, 8(30)
; NONINVFUNCDESC: mtctr 3
; NONINVFUNCDESC: bctrl
; NONINVFUNCDESC-NEXT: ld 2, 40(1)

for.body:                                         ; preds = %for.body, %entry
  ; %i.02 is the number of calls already made: 0 on entry, %inc on the back edge.
  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  ; The indirect call under test. It carries the `tail` marker, yet both check
  ; sequences above expect a bctrl followed by a reload of r2.
  tail call void %callee.knr.cast() #0
  %inc = add nuw nsw i32 %i.02, 1
  ; Leave the loop once the incremented count reaches 1600000000.
  %exitcond = icmp eq i32 %inc, 1600000000
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Attribute group #0 is referenced by the definition of @bar and by the call
; inside its loop.
attributes #0 = { nounwind }
[PowerPC] Loosen ELFv1 PPC64 func descriptor loads for indirect calls Function pointers under PPC64 ELFv1 (which is used on PPC64/Linux on the POWER7, A2 and earlier cores) are really pointers to a function descriptor, a structure with three pointers: the actual pointer to the code to which to jump, the pointer to the TOC needed by the callee, and an environment pointer. We used to chain these loads, and make them opaque to the rest of the optimizer, so that they'd always occur directly before the call. This is not necessary, and in fact, highly suboptimal on embedded cores. Once the function pointer is known, the loads can be performed ahead of time; in fact, they can be hoisted out of loops. Now these function descriptors are almost always generated by the linker, and thus the contents of the descriptors are invariant. As a result, by default, we'll mark the associated loads as invariant (allowing them to be hoisted out of loops). I've added a target feature to turn this off, however, just in case someone needs that option (constructing an on-stack descriptor, casting it to a function pointer, and then calling it cannot be well-defined C/C++ code, but I can imagine some JIT-compilation system doing so). Consider this simple test: $ cat call.c typedef void (fp)(); void bar(fp x) { for (int i = 0; i < 1600000000; ++i) x(); } $ cat main.c typedef void (fp)(); void bar(fp x); void foo() {} int main() { bar(foo); } On the PPC A2 (the BG/Q supercomputer), marking the function-descriptor loads as invariant brings the execution time down to ~8 seconds from ~32 seconds with the loads in the loop. The difference on the POWER7 is smaller. 
Compiling with: gcc -std=c99 -O3 -mcpu=native call.c main.c : ~6 seconds [this is 4.8.2] clang -O3 -mcpu=native call.c main.c : ~5.3 seconds clang -O3 -mcpu=native call.c main.c -mno-invariant-function-descriptors : ~4 seconds (looks like we'd benefit from additional loop unrolling here, as a first guess, because this is faster with the extra loads) The -mno-invariant-function-descriptors will be added to Clang shortly. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226207 91177308-0d34-0410-b5e6-96231b3b80d8 2015-01-15 21:17:34 +00:00			`; RUN: llc -mcpu=a2 < %s \| FileCheck %s -check-prefix=INVFUNCDESC`
			`; RUN: llc -mcpu=a2 -mattr=-invariant-function-descriptors < %s \| FileCheck %s -check-prefix=NONINVFUNCDESC`
			`target datalayout = "E-m:e-i64:64-n32:64"`
			`target triple = "powerpc64-unknown-linux-gnu"`

			`; Function Attrs: nounwind`
			`define void @bar(void (...)* nocapture %x) #0 {`
			`entry:`
			`%callee.knr.cast = bitcast void (...)* %x to void ()*`
			`br label %for.body`

			`; INVFUNCDESC-LABEL: @bar`
			`; INVFUNCDESC-DAG: ld [[REG1:[0-9]+]], 8(3)`
			`; INVFUNCDESC-DAG: ld [[REG2:[0-9]+]], 16(3)`
			`; INVFUNCDESC-DAG: ld [[REG3:[0-9]+]], 0(3)`

			`; INVFUNCDESC: %for.body`
[PowerPC] Add r2 as an operand for all calls under both PPC64 ELF V1 and V2 Our PPC64 ELF V2 call lowering logic added r2 as an operand to all direct call instructions in order to represent the dependency on the TOC base pointer value. Restricting this to ELF V2, however, does not seem to make sense: calls under ELF V1 have the same dependence, and indirect calls have an r2 dependence just as direct ones. Make sure the dependence is noted for all calls under both ELF V1 and ELF V2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226432 91177308-0d34-0410-b5e6-96231b3b80d8 2015-01-19 07:20:27 +00:00			`; INVFUNCDESC: std 2, 40(1)`
[PowerPC] Loosen ELFv1 PPC64 func descriptor loads for indirect calls Function pointers under PPC64 ELFv1 (which is used on PPC64/Linux on the POWER7, A2 and earlier cores) are really pointers to a function descriptor, a structure with three pointers: the actual pointer to the code to which to jump, the pointer to the TOC needed by the callee, and an environment pointer. We used to chain these loads, and make them opaque to the rest of the optimizer, so that they'd always occur directly before the call. This is not necessary, and in fact, highly suboptimal on embedded cores. Once the function pointer is known, the loads can be performed ahead of time; in fact, they can be hoisted out of loops. Now these function descriptors are almost always generated by the linker, and thus the contents of the descriptors are invariant. As a result, by default, we'll mark the associated loads as invariant (allowing them to be hoisted out of loops). I've added a target feature to turn this off, however, just in case someone needs that option (constructing an on-stack descriptor, casting it to a function pointer, and then calling it cannot be well-defined C/C++ code, but I can imagine some JIT-compilation system doing so). Consider this simple test: $ cat call.c typedef void (fp)(); void bar(fp x) { for (int i = 0; i < 1600000000; ++i) x(); } $ cat main.c typedef void (fp)(); void bar(fp x); void foo() {} int main() { bar(foo); } On the PPC A2 (the BG/Q supercomputer), marking the function-descriptor loads as invariant brings the execution time down to ~8 seconds from ~32 seconds with the loads in the loop. The difference on the POWER7 is smaller. 
Compiling with: gcc -std=c99 -O3 -mcpu=native call.c main.c : ~6 seconds [this is 4.8.2] clang -O3 -mcpu=native call.c main.c : ~5.3 seconds clang -O3 -mcpu=native call.c main.c -mno-invariant-function-descriptors : ~4 seconds (looks like we'd benefit from additional loop unrolling here, as a first guess, because this is faster with the extra loads) The -mno-invariant-function-descriptors will be added to Clang shortly. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226207 91177308-0d34-0410-b5e6-96231b3b80d8 2015-01-15 21:17:34 +00:00			`; INVFUNCDESC-DAG: mtctr [[REG3]]`
			`; INVFUNCDESC-DAG: mr 11, [[REG2]]`
[PowerPC] Add r2 as an operand for all calls under both PPC64 ELF V1 and V2 Our PPC64 ELF V2 call lowering logic added r2 as an operand to all direct call instructions in order to represent the dependency on the TOC base pointer value. Restricting this to ELF V2, however, does not seem to make sense: calls under ELF V1 have the same dependence, and indirect calls have an r2 dependence just as direct ones. Make sure the dependence is noted for all calls under both ELF V1 and ELF V2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226432 91177308-0d34-0410-b5e6-96231b3b80d8 2015-01-19 07:20:27 +00:00			`; INVFUNCDESC-DAG: mr 2, [[REG1]]`
[PowerPC] Loosen ELFv1 PPC64 func descriptor loads for indirect calls Function pointers under PPC64 ELFv1 (which is used on PPC64/Linux on the POWER7, A2 and earlier cores) are really pointers to a function descriptor, a structure with three pointers: the actual pointer to the code to which to jump, the pointer to the TOC needed by the callee, and an environment pointer. We used to chain these loads, and make them opaque to the rest of the optimizer, so that they'd always occur directly before the call. This is not necessary, and in fact, highly suboptimal on embedded cores. Once the function pointer is known, the loads can be performed ahead of time; in fact, they can be hoisted out of loops. Now these function descriptors are almost always generated by the linker, and thus the contents of the descriptors are invariant. As a result, by default, we'll mark the associated loads as invariant (allowing them to be hoisted out of loops). I've added a target feature to turn this off, however, just in case someone needs that option (constructing an on-stack descriptor, casting it to a function pointer, and then calling it cannot be well-defined C/C++ code, but I can imagine some JIT-compilation system doing so). Consider this simple test: $ cat call.c typedef void (fp)(); void bar(fp x) { for (int i = 0; i < 1600000000; ++i) x(); } $ cat main.c typedef void (fp)(); void bar(fp x); void foo() {} int main() { bar(foo); } On the PPC A2 (the BG/Q supercomputer), marking the function-descriptor loads as invariant brings the execution time down to ~8 seconds from ~32 seconds with the loads in the loop. The difference on the POWER7 is smaller. 
Compiling with: gcc -std=c99 -O3 -mcpu=native call.c main.c : ~6 seconds [this is 4.8.2] clang -O3 -mcpu=native call.c main.c : ~5.3 seconds clang -O3 -mcpu=native call.c main.c -mno-invariant-function-descriptors : ~4 seconds (looks like we'd benefit from additional loop unrolling here, as a first guess, because this is faster with the extra loads) The -mno-invariant-function-descriptors will be added to Clang shortly. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226207 91177308-0d34-0410-b5e6-96231b3b80d8 2015-01-15 21:17:34 +00:00			`; INVFUNCDESC: bctrl`
			`; INVFUNCDESC-NEXT: ld 2, 40(1)`

			`; NONINVFUNCDESC-LABEL: @bar`
			`; NONINVFUNCDESC: %for.body`
			`; NONINVFUNCDESC: std 2, 40(1)`
			`; NONINVFUNCDESC-DAG: ld 3, 0(30)`
			`; NONINVFUNCDESC-DAG: ld 11, 16(30)`
			`; NONINVFUNCDESC-DAG: ld 2, 8(30)`
			`; NONINVFUNCDESC: mtctr 3`
			`; NONINVFUNCDESC: bctrl`
			`; NONINVFUNCDESC-NEXT: ld 2, 40(1)`

			`for.body: ; preds = %for.body, %entry`
			`%i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]`
			`tail call void %callee.knr.cast() #0`
			`%inc = add nuw nsw i32 %i.02, 1`
			`%exitcond = icmp eq i32 %inc, 1600000000`
			`br i1 %exitcond, label %for.end, label %for.body`

			`for.end: ; preds = %for.body`
			`ret void`
			`}`

			`attributes #0 = { nounwind }`