diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4f898bc9d7a..c1ff817fc35 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7321,22 +7321,27 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { &DAG.getTarget().Options)) return GetNegatedExpression(N0, DAG, LegalOperations); - // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading + // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. - // TODO: We can also optimize for vectors here, but we need to make sure - // that the sign mask is created properly for each vector element. if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST && - !VT.isVector() && - N0.getNode()->hasOneUse() && - N0.getOperand(0).getValueType().isInteger()) { + N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { + APInt SignMask; + if (N0.getValueType().isVector()) { + // For a vector, get a mask such as 0x80... per scalar element + // and splat it. + SignMask = APInt::getSignBit(N0.getValueType().getScalarSizeInBits()); + SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); + } else { + // For a scalar, just generate 0x80... 
+ SignMask = APInt::getSignBit(IntVT.getSizeInBits()); + } Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int, - DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT)); + DAG.getConstant(SignMask, IntVT)); AddToWorklist(Int.getNode()); - return DAG.getNode(ISD::BITCAST, SDLoc(N), - VT, Int); + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Int); } } diff --git a/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll b/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll deleted file mode 100644 index 0f021d28aa1..00000000000 --- a/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -mcpu=cortex-a8 -mattr=+neon < %s | grep vneg -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" -target triple = "armv7-eabi" - -%aaa = type { %fff, %fff } -%bbb = type { [6 x %ddd] } -%ccc = type { %eee, %fff } -%ddd = type { %fff } -%eee = type { %fff, %fff, %fff, %fff } -%fff = type { %struct.vec_float4 } -%struct.vec_float4 = type { <4 x float> } - -define linkonce_odr arm_aapcs_vfpcc void @foo(%eee* noalias sret %agg.result, i64 %tfrm.0.0, i64 %tfrm.0.1, i64 %tfrm.0.2, i64 %tfrm.0.3, i64 %tfrm.0.4, i64 %tfrm.0.5, i64 %tfrm.0.6, i64 %tfrm.0.7) nounwind noinline { -entry: - %tmp104 = zext i64 %tfrm.0.2 to i512 ; [#uses=1] - %tmp105 = shl i512 %tmp104, 128 ; [#uses=1] - %tmp118 = zext i64 %tfrm.0.3 to i512 ; [#uses=1] - %tmp119 = shl i512 %tmp118, 192 ; [#uses=1] - %ins121 = or i512 %tmp119, %tmp105 ; [#uses=1] - %tmp99 = zext i64 %tfrm.0.4 to i512 ; [#uses=1] - %tmp100 = shl i512 %tmp99, 256 ; [#uses=1] - %tmp123 = zext i64 %tfrm.0.5 to i512 ; [#uses=1] - %tmp124 = shl i512 %tmp123, 320 ; [#uses=1] - %tmp96 = zext i64 %tfrm.0.6 to i512 ; [#uses=1] - %tmp97 = shl i512 %tmp96, 384 ; [#uses=1] - %tmp128 = zext i64 %tfrm.0.7 to i512 ; [#uses=1] - %tmp129 = shl i512 %tmp128, 448 ; [#uses=1] - %mask.masked = or i512 %tmp124, %tmp100 ; [#uses=1] - %ins131 = or i512 %tmp129, %tmp97 ; 
[#uses=1] - %tmp109132 = zext i64 %tfrm.0.0 to i128 ; [#uses=1] - %tmp113134 = zext i64 %tfrm.0.1 to i128 ; [#uses=1] - %tmp114133 = shl i128 %tmp113134, 64 ; [#uses=1] - %tmp94 = or i128 %tmp114133, %tmp109132 ; [#uses=1] - %tmp95 = bitcast i128 %tmp94 to <4 x float> ; <<4 x float>> [#uses=0] - %tmp82 = lshr i512 %ins121, 128 ; [#uses=1] - %tmp83 = trunc i512 %tmp82 to i128 ; [#uses=1] - %tmp84 = bitcast i128 %tmp83 to <4 x float> ; <<4 x float>> [#uses=0] - %tmp86 = lshr i512 %mask.masked, 256 ; [#uses=1] - %tmp87 = trunc i512 %tmp86 to i128 ; [#uses=1] - %tmp88 = bitcast i128 %tmp87 to <4 x float> ; <<4 x float>> [#uses=0] - %tmp90 = lshr i512 %ins131, 384 ; [#uses=1] - %tmp91 = trunc i512 %tmp90 to i128 ; [#uses=1] - %tmp92 = bitcast i128 %tmp91 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %tmp92 ; <<4 x float>> [#uses=1] - %tmp28 = getelementptr inbounds %eee* %agg.result, i32 0, i32 3, i32 0, i32 0 ; <<4 x float>*> [#uses=1] - store <4 x float> %tmp, <4 x float>* %tmp28, align 16 - ret void -} diff --git a/test/CodeGen/ARM/fnegs.ll b/test/CodeGen/ARM/fnegs.ll index 99146b923a4..65fe9e36fa1 100644 --- a/test/CodeGen/ARM/fnegs.ll +++ b/test/CodeGen/ARM/fnegs.ll @@ -73,3 +73,49 @@ entry: ; CORTEXA9-LABEL: test2: ; CORTEXA9: vneg.f32 s{{.*}}, s{{.*}} +; If we're bitcasting an integer to an FP vector, we should avoid the FP/vector unit entirely. +; Make sure that we're flipping the sign bit and only the sign bit of each float (PR20354). 
+; So instead of something like this: +; vmov d16, r0, r1 +; vneg.f32 d16, d16 +; vmov r0, r1, d16 +; +; We should generate: +; eor r0, r0, #-2147483648 +; eor r1, r1, #-2147483648 + +define <2 x float> @fneg_bitcast(i64 %i) { + %bitcast = bitcast i64 %i to <2 x float> + %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast + ret <2 x float> %fneg +} +; VFP2-LABEL: fneg_bitcast: +; VFP2-DAG: eor r0, r0, #-2147483648 +; VFP2-DAG: eor r1, r1, #-2147483648 +; VFP2-NOT: vneg.f32 + +; NFP1-LABEL: fneg_bitcast: +; NFP1-DAG: eor r0, r0, #-2147483648 +; NFP1-DAG: eor r1, r1, #-2147483648 +; NFP1-NOT: vneg.f32 + +; NFP0-LABEL: fneg_bitcast: +; NFP0-DAG: eor r0, r0, #-2147483648 +; NFP0-DAG: eor r1, r1, #-2147483648 +; NFP0-NOT: vneg.f32 + +; CORTEXA8-LABEL: fneg_bitcast: +; CORTEXA8-DAG: eor r0, r0, #-2147483648 +; CORTEXA8-DAG: eor r1, r1, #-2147483648 +; CORTEXA8-NOT: vneg.f32 + +; CORTEXA8U-LABEL: fneg_bitcast: +; CORTEXA8U-DAG: eor r0, r0, #-2147483648 +; CORTEXA8U-DAG: eor r1, r1, #-2147483648 +; CORTEXA8U-NOT: vneg.f32 + +; CORTEXA9-LABEL: fneg_bitcast: +; CORTEXA9-DAG: eor r0, r0, #-2147483648 +; CORTEXA9-DAG: eor r1, r1, #-2147483648 +; CORTEXA9-NOT: vneg.f32 + diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll index 00383be02c9..9743f7148c6 100644 --- a/test/CodeGen/X86/vec_fneg.ll +++ b/test/CodeGen/X86/vec_fneg.ll @@ -21,3 +21,25 @@ define <4 x float> @t2(<4 x float> %Q) { %tmp = fsub <4 x float> zeroinitializer, %Q ret <4 x float> %tmp } + +; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely. +; Make sure that we're flipping the sign bit and only the sign bit of each float. 
+; So instead of something like this: +; movd %rdi, %xmm0 +; xorps .LCPI2_0(%rip), %xmm0 +; +; We should generate: +; movabsq (put sign bit mask in integer register) +; xorq (flip sign bits) +; movd (move to xmm return register) + +define <2 x float> @fneg_bitcast(i64 %i) { +; CHECK-LABEL: fneg_bitcast: +; CHECK: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000 +; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: movd %rax, %xmm0 +; CHECK-NEXT: retq + %bitcast = bitcast i64 %i to <2 x float> + %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast + ret <2 x float> %fneg +}