From 8939b0d8a930729cd8d9c06dd4afd08de93260d2 Mon Sep 17 00:00:00 2001 From: David Greene Date: Tue, 16 Feb 2010 20:50:18 +0000 Subject: [PATCH] Add support for emitting non-temporal stores for DAGs marked non-temporal. Fix from r96241 for botched encoding of MOVNTDQ. Add documentation for !nontemporal metadata. Add a simpler movnt testcase. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@96386 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.html | 28 ++++++-- lib/Target/X86/X86InstrSSE.td | 78 +++++++++++++++++++--- test/CodeGen/X86/2010-02-11-NonTemporal.ll | 22 ++++++ 3 files changed, 115 insertions(+), 13 deletions(-) create mode 100644 test/CodeGen/X86/2010-02-11-NonTemporal.ll diff --git a/docs/LangRef.html b/docs/LangRef.html index b337b6a8571..76208388211 100644 --- a/docs/LangRef.html +++ b/docs/LangRef.html @@ -4074,8 +4074,9 @@ Instruction
Syntax:
-  <result> = load <ty>* <pointer>[, align <alignment>]
-  <result> = volatile load <ty>* <pointer>[, align <alignment>]
+  <result> = load <ty>* <pointer>[, align <alignment>][, !nontemporal !]
+  <result> = volatile load <ty>* <pointer>[, align <alignment>][, !nontemporal !]
+  ! = !{ i32 1 }
 
Overview:
@@ -4088,7 +4089,7 @@ Instruction marked as volatile, then the optimizer is not allowed to modify the number or order of execution of this load with other volatile load and store - instructions.

+ instructions.

The optional constant "align" argument specifies the alignment of the operation (that is, the alignment of the memory address). A value of 0 or an @@ -4098,6 +4099,14 @@ Instruction alignment results in an undefined behavior. Underestimating the alignment may produce less efficient code. An alignment of 1 is always safe.

+

The optional !nontemporal metadata must reference a single metatadata + name corresponding to a metadata node with one i32 entry of + value 1. The existance of the !nontemporal metatadata on the + instruction tells the optimizer and code generator that this load is + not expected to be reused in the cache. The code generator may + select special instructions to save cache bandwidth, such as the + MOVNT intruction on x86.

+
Semantics:

The location of memory pointed to is loaded. If the value being loaded is of scalar type then the number of bytes read does not exceed the minimum number @@ -4124,8 +4133,8 @@ Instruction

Syntax:
-  store <ty> <value>, <ty>* <pointer>[, align <alignment>]                   ; yields {void}
-  volatile store <ty> <value>, <ty>* <pointer>[, align <alignment>]          ; yields {void}
+  store <ty> <value>, <ty>* <pointer>[, align <alignment>][, !nontemporal !]                   ; yields {void}
+  volatile store <ty> <value>, <ty>* <pointer>[, align <alignment>][, !nontemporal !]          ; yields {void}
 
Overview:
@@ -4150,6 +4159,15 @@ Instruction alignment results in an undefined behavior. Underestimating the alignment may produce less efficient code. An alignment of 1 is always safe.

+

The optional !nontemporal metadata must reference a single metatadata + name corresponding to a metadata node with one i32 entry of + value 1. The existance of the !nontemporal metatadata on the + instruction tells the optimizer and code generator that this load is + not expected to be reused in the cache. The code generator may + select special instructions to save cache bandwidth, such as the + MOVNT intruction on x86.

+ +
Semantics:

The contents of memory are updated to contain '<value>' at the location specified by the '<pointer>' operand. If diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 9b2140f3a40..b69d2ca1548 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -160,6 +160,32 @@ def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>; def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>; def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>; +// MOVNT Support +// Like 'store', but requires the non-temporal bit to be set +def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast(N)) + return ST->isNonTemporal(); + return false; +}]>; + +def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast(N)) + return ST->isNonTemporal() && !ST->isTruncatingStore() && + ST->getAddressingMode() == ISD::UNINDEXED && + ST->getAlignment() >= 16; + return false; +}]>; + +def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast(N)) + return ST->isNonTemporal() && + ST->getAlignment() < 16; + return false; +}]>; + def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; @@ -1013,10 +1039,33 @@ def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>; // Non-temporal stores -def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), +def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; +let AddedComplexity = 400 in { // Prefer non-temporal versions +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + +def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>; + +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (MOVNTDQ_64mr VR128:$src, addr:$dst)>; + +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; + +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; +} + // Load, store, and memory fence def SFENCE : PSI<0xAE, MRM7r, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>; @@ -2298,17 +2347,30 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; // Non-temporal stores -def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; -def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; -def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), +def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; +def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; +def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti\t{$src, $dst|$dst, $src}", [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, TB, Requires<[HasSSE2]>; +let AddedComplexity = 400 in { // Prefer non-temporal versions +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; + +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTDQmr VR128:$src, addr:$dst)>; +} + // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, diff --git a/test/CodeGen/X86/2010-02-11-NonTemporal.ll b/test/CodeGen/X86/2010-02-11-NonTemporal.ll new file mode 100644 index 00000000000..5789a0b9847 --- /dev/null +++ b/test/CodeGen/X86/2010-02-11-NonTemporal.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s +; CHECK: movnt +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +target triple = "x86_64-unknown-linux-gnu" + +!0 = metadata !{ i32 1 } + +define void @sub_(i32* noalias %n) { +"file movnt.f90, line 2, bb1": + %n1 = alloca i32*, align 8 + %i = alloca i32, align 4 + %"$LCS_0" = alloca i64, align 8 + %"$LCS_S2" = alloca <2 x double>, align 16 + %r9 = load <2 x double>* %"$LCS_S2", align 8 + %r10 = load i64* %"$LCS_0", align 8 + %r11 = inttoptr i64 %r10 to <2 x double>* + store <2 x double> %r9, <2 x double>* %r11, align 16, !nontemporal !0 + br label %"file movnt.f90, line 18, bb5" + +"file movnt.f90, line 18, bb5": + ret void +}