2014-04-27 18:47:41 +00:00
|
|
|
; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
|
|
|
|
; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
|
2014-04-26 12:06:28 +00:00
|
|
|
; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
|
|
|
|
|
|
|
|
; test1 - unsigned division of a <4 x i32> vector by the splat constant 7.
; Checked for the SSE4.1 and AVX runs. Both expect two 32x32->64 unsigned
; multiplies (pmuludq) with shuffles around them, followed by a
; subtract / shift-right-by-1 / add / shift-right-by-2 sequence.
; NOTE(review): presumably this is the multiply-by-magic-constant lowering of
; division by a constant; no check in this function forbids a real divide.
define <4 x i32> @test1(<4 x i32> %a) {
|
|
|
|
%div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
|
|
|
|
ret <4 x i32> %div
|
|
|
|
|
2014-04-27 18:47:41 +00:00
|
|
|
; SSE41-LABEL: test1:
|
|
|
|
; SSE41: pmuludq
|
2014-07-09 11:12:39 +00:00
|
|
|
; SSE41: pshufd $49
|
2014-04-27 18:47:41 +00:00
|
|
|
; SSE41: pmuludq
|
|
|
|
; SSE41: shufps $-35
|
|
|
|
; SSE41: psubd
|
|
|
|
; SSE41: psrld $1
|
|
|
|
; SSE41: padd
|
|
|
|
; SSE41: psrld $2
|
2014-04-26 12:06:28 +00:00
|
|
|
|
|
|
|
; AVX-LABEL: test1:
|
|
|
|
; AVX: vpmuludq
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpshufd $49
|
2014-04-26 12:06:28 +00:00
|
|
|
; AVX: vpmuludq
|
|
|
|
; AVX: vshufps $-35
|
|
|
|
; AVX: vpsubd
|
|
|
|
; AVX: vpsrld $1
|
|
|
|
; AVX: vpadd
|
|
|
|
; AVX: vpsrld $2
|
|
|
|
}
|
|
|
|
|
|
|
|
; test2 - unsigned division of an <8 x i32> vector by the splat constant 7.
; Only the AVX run is checked. It expects a broadcast of a constant
; (vpbroadcastd), a vpalignr $4, two vpmuludq multiplies recombined with
; vpblendd $170, then subtract / shift-by-1 / add / shift-by-2.
; NOTE(review): this looks like the 256-bit counterpart of test1 - confirm.
define <8 x i32> @test2(<8 x i32> %a) {
|
|
|
|
%div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
|
|
|
|
ret <8 x i32> %div
|
|
|
|
|
|
|
|
; AVX-LABEL: test2:
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpbroadcastd
|
|
|
|
; AVX: vpalignr $4
|
2014-04-26 12:06:28 +00:00
|
|
|
; AVX: vpmuludq
|
|
|
|
; AVX: vpmuludq
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpblendd $170
|
2014-04-26 12:06:28 +00:00
|
|
|
; AVX: vpsubd
|
|
|
|
; AVX: vpsrld $1
|
|
|
|
; AVX: vpadd
|
|
|
|
; AVX: vpsrld $2
|
|
|
|
}
|
|
|
|
|
2014-04-26 13:01:03 +00:00
|
|
|
; test3 - unsigned division of an <8 x i16> vector by the splat constant 7.
; Checked for the SSE4.1 and AVX runs. Both expect an unsigned high-half
; 16-bit multiply (pmulhuw / vpmulhuw) followed by
; subtract / shift-right-by-1 / add / shift-right-by-2 on 16-bit lanes.
define <8 x i16> @test3(<8 x i16> %a) {
|
|
|
|
%div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
ret <8 x i16> %div
|
|
|
|
|
2014-04-27 18:47:41 +00:00
|
|
|
; SSE41-LABEL: test3:
|
|
|
|
; SSE41: pmulhuw
|
|
|
|
; SSE41: psubw
|
|
|
|
; SSE41: psrlw $1
|
|
|
|
; SSE41: paddw
|
|
|
|
; SSE41: psrlw $2
|
2014-04-26 13:01:03 +00:00
|
|
|
|
|
|
|
; AVX-LABEL: test3:
|
|
|
|
; AVX: vpmulhuw
|
|
|
|
; AVX: vpsubw
|
|
|
|
; AVX: vpsrlw $1
|
|
|
|
; AVX: vpaddw
|
|
|
|
; AVX: vpsrlw $2
|
|
|
|
}
|
|
|
|
|
|
|
|
; test4 - unsigned division of a <16 x i16> vector by the splat constant 7.
; Only the AVX run is checked. It expects the same vpmulhuw / vpsubw /
; vpsrlw $1 / vpaddw / vpsrlw $2 sequence as test3, and the trailing NOT
; check rejects any further vpmulhuw after that sequence.
; NOTE(review): presumably the NOT check guards against the 256-bit operation
; being split into two 128-bit halves - confirm.
define <16 x i16> @test4(<16 x i16> %a) {
|
|
|
|
%div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
|
|
|
|
ret <16 x i16> %div
|
|
|
|
|
|
|
|
; AVX-LABEL: test4:
|
|
|
|
; AVX: vpmulhuw
|
|
|
|
; AVX: vpsubw
|
|
|
|
; AVX: vpsrlw $1
|
|
|
|
; AVX: vpaddw
|
|
|
|
; AVX: vpsrlw $2
|
|
|
|
; AVX-NOT: vpmulhuw
|
|
|
|
}
|
|
|
|
|
|
|
|
; test5 - signed division of an <8 x i16> vector by the splat constant 7.
; Checked for the SSE4.1 and AVX runs. Both expect a signed high-half 16-bit
; multiply (pmulhw / vpmulhw), a logical shift right by 15, an arithmetic
; shift right by 1, and a 16-bit add.
; NOTE(review): the shift by 15 presumably isolates the sign bit for the
; rounding fixup - confirm.
define <8 x i16> @test5(<8 x i16> %a) {
|
|
|
|
%div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
|
|
|
ret <8 x i16> %div
|
|
|
|
|
2014-04-27 18:47:41 +00:00
|
|
|
; SSE41-LABEL: test5:
|
|
|
|
; SSE41: pmulhw
|
|
|
|
; SSE41: psrlw $15
|
|
|
|
; SSE41: psraw $1
|
|
|
|
; SSE41: paddw
|
2014-04-26 13:01:03 +00:00
|
|
|
|
|
|
|
; AVX-LABEL: test5:
|
|
|
|
; AVX: vpmulhw
|
|
|
|
; AVX: vpsrlw $15
|
|
|
|
; AVX: vpsraw $1
|
|
|
|
; AVX: vpaddw
|
|
|
|
}
|
|
|
|
|
|
|
|
; test6 - signed division of a <16 x i16> vector by the splat constant 7.
; Only the AVX run is checked. It expects the same vpmulhw / vpsrlw $15 /
; vpsraw $1 / vpaddw sequence as test5, and the trailing NOT check rejects
; any further vpmulhw after that sequence.
; NOTE(review): presumably the NOT check guards against the 256-bit operation
; being split into two 128-bit halves - confirm.
define <16 x i16> @test6(<16 x i16> %a) {
|
|
|
|
%div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
|
|
|
|
ret <16 x i16> %div
|
|
|
|
|
|
|
|
; AVX-LABEL: test6:
|
|
|
|
; AVX: vpmulhw
|
|
|
|
; AVX: vpsrlw $15
|
|
|
|
; AVX: vpsraw $1
|
|
|
|
; AVX: vpaddw
|
|
|
|
; AVX-NOT: vpmulhw
|
|
|
|
}
|
|
|
|
|
2014-04-26 14:12:19 +00:00
|
|
|
; test7 - signed division of a <16 x i8> vector by the splat constant 7.
; The existing FIXME records that this case is scalarized. The SSE4.1 and AVX
; checks only look for the substring "pext" after the function label.
; NOTE(review): "pext" presumably matches the per-element extract
; instructions emitted by the scalarized code - confirm against llc output.
define <16 x i8> @test7(<16 x i8> %a) {
|
|
|
|
%div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
|
|
|
|
ret <16 x i8> %div
|
2014-07-09 11:12:39 +00:00
|
|
|
|
|
|
|
; FIXME: scalarized
|
|
|
|
; SSE41-LABEL: test7:
|
|
|
|
; SSE41: pext
|
|
|
|
; AVX-LABEL: test7:
|
|
|
|
; AVX: pext
|
2014-04-26 14:12:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
; test8 - signed division of a <4 x i32> vector by the splat constant 7.
; Checked for all three runs:
;  * SSE4.1 and AVX expect signed 32x32->64 multiplies (pmuldq / vpmuldq)
;    with shuffles, then add / shift-right-logical-by-31 /
;    shift-right-arithmetic-by-2 / add.
;  * Plain SSE (the core2 run without +sse4.1) expects unsigned pmuludq
;    multiplies instead, with an additional psubd before the same tail.
; NOTE(review): the extra psubd presumably corrects the unsigned product to a
; signed one where pmuldq is unavailable - confirm.
define <4 x i32> @test8(<4 x i32> %a) {
|
|
|
|
%div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
|
|
|
|
ret <4 x i32> %div
|
|
|
|
|
2014-04-27 18:47:41 +00:00
|
|
|
; SSE41-LABEL: test8:
|
|
|
|
; SSE41: pmuldq
|
2014-07-09 11:12:39 +00:00
|
|
|
; SSE41: pshufd $49
|
2014-07-24 22:15:28 +00:00
|
|
|
; SSE41: pshufd $49
|
2014-04-27 18:47:41 +00:00
|
|
|
; SSE41: pmuldq
|
|
|
|
; SSE41: shufps $-35
|
|
|
|
; SSE41: pshufd $-40
|
|
|
|
; SSE41: padd
|
|
|
|
; SSE41: psrld $31
|
|
|
|
; SSE41: psrad $2
|
|
|
|
; SSE41: padd
|
|
|
|
|
2014-04-26 14:12:19 +00:00
|
|
|
; SSE-LABEL: test8:
|
2014-07-26 02:14:54 +00:00
|
|
|
; SSE: pmuludq
|
|
|
|
; SSE: pshufd $49
|
|
|
|
; SSE: pshufd $49
|
|
|
|
; SSE: pmuludq
|
|
|
|
; SSE: shufps $-35
|
|
|
|
; SSE: pshufd $-40
|
|
|
|
; SSE: psubd
|
|
|
|
; SSE: padd
|
|
|
|
; SSE: psrld $31
|
|
|
|
; SSE: psrad $2
|
|
|
|
; SSE: padd
|
2014-04-26 14:12:19 +00:00
|
|
|
|
|
|
|
; AVX-LABEL: test8:
|
|
|
|
; AVX: vpmuldq
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpshufd $49
|
2014-07-24 22:15:28 +00:00
|
|
|
; AVX: vpshufd $49
|
2014-04-26 14:12:19 +00:00
|
|
|
; AVX: vpmuldq
|
|
|
|
; AVX: vshufps $-35
|
|
|
|
; AVX: vpshufd $-40
|
|
|
|
; AVX: vpadd
|
|
|
|
; AVX: vpsrld $31
|
|
|
|
; AVX: vpsrad $2
|
|
|
|
; AVX: vpadd
|
|
|
|
}
|
|
|
|
|
|
|
|
; test9 - signed division of an <8 x i32> vector by the splat constant 7.
; Only the AVX run is checked. It expects a constant broadcast
; (vpbroadcastd), vpalignr $4 shuffles around two signed vpmuldq multiplies,
; a vpblendd $170 to recombine, then add / shift-right-logical-by-31 /
; shift-right-arithmetic-by-2 / add.
; NOTE(review): this looks like the 256-bit counterpart of test8 - confirm.
define <8 x i32> @test9(<8 x i32> %a) {
|
|
|
|
%div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
|
|
|
|
ret <8 x i32> %div
|
|
|
|
|
|
|
|
; AVX-LABEL: test9:
|
2014-04-27 11:41:06 +00:00
|
|
|
; AVX: vpbroadcastd
|
2014-07-24 22:15:28 +00:00
|
|
|
; AVX: vpalignr $4
|
|
|
|
; AVX: vpalignr $4
|
2014-04-26 14:12:19 +00:00
|
|
|
; AVX: vpmuldq
|
|
|
|
; AVX: vpmuldq
|
2014-07-24 22:15:28 +00:00
|
|
|
; AVX: vpalignr $4
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpblendd $170
|
2014-04-26 14:12:19 +00:00
|
|
|
; AVX: vpadd
|
|
|
|
; AVX: vpsrld $31
|
|
|
|
; AVX: vpsrad $2
|
|
|
|
; AVX: vpadd
|
|
|
|
}
|
2014-04-26 23:09:49 +00:00
|
|
|
|
|
|
|
; test10 - unsigned remainder of an <8 x i32> vector by the splat constant 7.
; Only the AVX run is checked. The expected sequence matches the unsigned
; division checks of test2 (vpbroadcastd, vpalignr $4, two vpmuludq,
; vpblendd $170, sub / shift / add / shift) and then additionally requires a
; vpmulld.
; NOTE(review): the vpmulld presumably multiplies the quotient back by the
; divisor to form the remainder; the final subtract is not checked.
define <8 x i32> @test10(<8 x i32> %a) {
|
|
|
|
%rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
|
|
|
|
ret <8 x i32> %rem
|
|
|
|
|
|
|
|
; AVX-LABEL: test10:
|
2014-04-27 11:41:06 +00:00
|
|
|
; AVX: vpbroadcastd
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpalignr $4
|
2014-04-26 23:09:49 +00:00
|
|
|
; AVX: vpmuludq
|
|
|
|
; AVX: vpmuludq
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpblendd $170
|
2014-04-26 23:09:49 +00:00
|
|
|
; AVX: vpsubd
|
|
|
|
; AVX: vpsrld $1
|
|
|
|
; AVX: vpadd
|
|
|
|
; AVX: vpsrld $2
|
|
|
|
; AVX: vpmulld
|
|
|
|
}
|
|
|
|
|
|
|
|
; test11 - signed remainder of an <8 x i32> vector by the splat constant 7.
; Only the AVX run is checked. The expected sequence matches the signed
; division checks of test9 (vpbroadcastd, vpalignr $4 shuffles, two vpmuldq,
; vpblendd $170, add / shift-by-31 / arithmetic-shift-by-2 / add) and then
; additionally requires a vpmulld.
; NOTE(review): the vpmulld presumably multiplies the quotient back by the
; divisor to form the remainder; the final subtract is not checked.
define <8 x i32> @test11(<8 x i32> %a) {
|
|
|
|
%rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
|
|
|
|
ret <8 x i32> %rem
|
|
|
|
|
|
|
|
; AVX-LABEL: test11:
|
2014-04-27 11:41:06 +00:00
|
|
|
; AVX: vpbroadcastd
|
2014-07-24 22:15:28 +00:00
|
|
|
; AVX: vpalignr $4
|
|
|
|
; AVX: vpalignr $4
|
2014-04-26 23:09:49 +00:00
|
|
|
; AVX: vpmuldq
|
|
|
|
; AVX: vpmuldq
|
2014-07-24 22:15:28 +00:00
|
|
|
; AVX: vpalignr $4
|
2014-07-09 11:12:39 +00:00
|
|
|
; AVX: vpblendd $170
|
2014-04-26 23:09:49 +00:00
|
|
|
; AVX: vpadd
|
|
|
|
; AVX: vpsrld $31
|
|
|
|
; AVX: vpsrad $2
|
|
|
|
; AVX: vpadd
|
|
|
|
; AVX: vpmulld
|
|
|
|
}
|
2014-05-02 12:35:22 +00:00
|
|
|
|
|
|
|
; test12 - unsigned remainder of a constant <2 x i16> vector by itself.
; Both lanes are set to -1 through two insertelement instructions, and the
; vector is then used as both operands of the urem. Only the AVX run is
; checked, and it only requires an xorps after the function label.
; NOTE(review): presumably the remainder folds to an all-zero vector that is
; materialized with an xor - confirm against llc output.
define <2 x i16> @test12() {
|
|
|
|
%I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
|
|
|
|
%I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
|
|
|
|
%B9 = urem <2 x i16> %I9, %I9
|
|
|
|
ret <2 x i16> %B9
|
|
|
|
|
|
|
|
; AVX-LABEL: test12:
|
|
|
|
; AVX: xorps
|
|
|
|
}
|
[x86] Fix PR20355 (for real). There are many layers to this bug.
The tale starts with r212808 which attempted to fix inversion of the low
and high bits when lowering MUL_LOHI. Sadly, that commit did not include
any positive test cases, and just removed some operations from a test
case where the actual logic being changed isn't fully visible from the
test.
What this commit did was two things. First, it reversed the low and high
results in the formation of the MERGE_VALUES node for the multiple
results. This is entirely correct.
Second it changed the shuffles for extracting the low and high
components from the i64 results of the multiplies to extract them
assuming a big-endian-style encoding of the multiply results. This
second change is wrong. There is no big-endian encoding in x86, the
results of the multiplies are normal v2i64s: when cast to v4i32, the low
i32s are at offsets 0 and 2, and the high i32s are at offsets 1 and 3.
However, the first change wasn't enough to actually fix the bug, which
is (I assume) why the second change was also made. There was another bug
in the MERGE_VALUES formation: we weren't using a VTList, and so were
getting a single result node! When grabbing the *second* result from the
node, we got... well.. could be anything. I think this *appeared* to
invert things, but had to be causing other problems as well.
Fortunately, I fixed the MERGE_VALUES issue in r213931, so we should
have been fine, right? NOOOPE! Because the core bug was never addressed,
the test in vector-idiv failed when I fixed the MERGE_VALUES node.
Because there are essentially no docs for this node, I had to guess at
how to fix it and tried swapping the operands, restoring the order of
the original code before r212808. While this "fixed" the test case (in
that we produced the right instructions) we were still extracting the
wrong elements of the i64s, and thus PR20355 was still broken.
This commit essentially reverts the big-endian-style extraction part of
r212808 and goes back to the original masks which were correct. Now that
the MERGE_VALUES node formation is also correct, everything works. I've
also included a more detailed test from PR20355 to make sure this stays
fixed.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214011 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-26 03:46:57 +00:00
|
|
|
|
|
|
|
; PR20355 - signed division of a <4 x i32> vector by the splat constant 3.
; Unlike the tests above, the plain SSE and SSE4.1 runs pin the complete
; instruction sequence up to retq, using NEXT checks and FileCheck variables
; (X1, X2, ...) so that register allocation is not hard-coded.
;  * The first check in each run matches a movdqa whose source is either an
;    LCPI constant-pool label or the __xmm@5555... constant symbol.
;  * Both runs require the shufps to select elements [1,3] of each product
;    register, followed by a pshufd [0,2,1,3].
;  * The plain SSE run uses pmuludq and additionally computes a correction
;    from psrad $31 / pand / paddd that is subtracted with psubd; the SSE4.1
;    run uses pmuldq and has no such correction.
; NOTE(review): elements [1,3] are presumably the high i32 halves of the two
; 64-bit products, which is what the regression is about - confirm.
define <4 x i32> @PR20355(<4 x i32> %a) {
|
|
|
|
; SSE-LABEL: PR20355:
|
2014-07-26 04:47:01 +00:00
|
|
|
; SSE: movdqa {{(.*LCPI|__xmm@55555556555555565555555655555556).*}}, %[[X1:xmm[0-9]+]]
|
[x86] Fix PR20355 (for real). There are many layers to this bug.
The tale starts with r212808 which attempted to fix inversion of the low
and high bits when lowering MUL_LOHI. Sadly, that commit did not include
any positive test cases, and just removed some operations from a test
case where the actual logic being changed isn't fully visible from the
test.
What this commit did was two things. First, it reversed the low and high
results in the formation of the MERGE_VALUES node for the multiple
results. This is entirely correct.
Second it changed the shuffles for extracting the low and high
components from the i64 results of the multiplies to extract them
assuming a big-endian-style encoding of the multiply results. This
second change is wrong. There is no big-endian encoding in x86, the
results of the multiplies are normal v2i64s: when cast to v4i32, the low
i32s are at offsets 0 and 2, and the high i32s are at offsets 1 and 3.
However, the first change wasn't enough to actually fix the bug, which
is (I assume) why the second change was also made. There was another bug
in the MERGE_VALUES formation: we weren't using a VTList, and so were
getting a single result node! When grabbing the *second* result from the
node, we got... well.. colud be anything. I think this *appeared* to
invert things, but had to be causing other problems as well.
Fortunately, I fixed the MERGE_VALUES issue in r213931, so we should
have been fine, right? NOOOPE! Because the core bug was never addressed,
the test in vector-idiv failed when I fixed the MERGE_VALUES node.
Because there are essentially no docs for this node, I had to guess at
how to fix it and tried swapping the operands, restoring the order of
the original code before r212808. While this "fixed" the test case (in
that we produced the write instructions) we were still extracting the
wrong elements of the i64s, and thus PR20355 was still broken.
This commit essentially reverts the big-endian-style extraction part of
r212808 and goes back to the original masks which were correct. Now that
the MERGE_VALUES node formation is also correct, everything works. I've
also included a more detailed test from PR20355 to make sure this stays
fixed.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214011 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-26 03:46:57 +00:00
|
|
|
; SSE-NEXT: movdqa %[[X1]], %[[X2:xmm[0-9]+]]
|
|
|
|
; SSE-NEXT: psrad $31, %[[X2]]
|
|
|
|
; SSE-NEXT: pand %xmm0, %[[X2]]
|
|
|
|
; SSE-NEXT: movdqa %xmm0, %[[X3:xmm[0-9]+]]
|
|
|
|
; SSE-NEXT: psrad $31, %[[X3]]
|
|
|
|
; SSE-NEXT: pand %[[X1]], %[[X3]]
|
|
|
|
; SSE-NEXT: paddd %[[X2]], %[[X3]]
|
|
|
|
; SSE-NEXT: pshufd {{.*}} # [[X4:xmm[0-9]+]] = xmm0[1,0,3,0]
|
|
|
|
; SSE-NEXT: pmuludq %[[X1]], %xmm0
|
|
|
|
; SSE-NEXT: pshufd {{.*}} # [[X1]] = [[X1]][1,0,3,0]
|
|
|
|
; SSE-NEXT: pmuludq %[[X4]], %[[X1]]
|
|
|
|
; SSE-NEXT: shufps {{.*}} # xmm0 = xmm0[1,3],[[X1]][1,3]
|
|
|
|
; SSE-NEXT: pshufd {{.*}} # [[X5:xmm[0-9]+]] = xmm0[0,2,1,3]
|
|
|
|
; SSE-NEXT: psubd %[[X3]], %[[X5]]
|
|
|
|
; SSE-NEXT: movdqa %[[X5]], %xmm0
|
|
|
|
; SSE-NEXT: psrld $31, %xmm0
|
|
|
|
; SSE-NEXT: paddd %[[X5]], %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: PR20355:
|
2014-07-26 04:47:01 +00:00
|
|
|
; SSE41: movdqa {{(.*LCPI|__xmm@55555556555555565555555655555556).*}}, %[[X1:xmm[0-9]+]]
|
[x86] Fix PR20355 (for real). There are many layers to this bug.
The tale starts with r212808 which attempted to fix inversion of the low
and high bits when lowering MUL_LOHI. Sadly, that commit did not include
any positive test cases, and just removed some operations from a test
case where the actual logic being changed isn't fully visible from the
test.
What this commit did was two things. First, it reversed the low and high
results in the formation of the MERGE_VALUES node for the multiple
results. This is entirely correct.
Second it changed the shuffles for extracting the low and high
components from the i64 results of the multiplies to extract them
assuming a big-endian-style encoding of the multiply results. This
second change is wrong. There is no big-endian encoding in x86, the
results of the multiplies are normal v2i64s: when cast to v4i32, the low
i32s are at offsets 0 and 2, and the high i32s are at offsets 1 and 3.
However, the first change wasn't enough to actually fix the bug, which
is (I assume) why the second change was also made. There was another bug
in the MERGE_VALUES formation: we weren't using a VTList, and so were
getting a single result node! When grabbing the *second* result from the
node, we got... well.. colud be anything. I think this *appeared* to
invert things, but had to be causing other problems as well.
Fortunately, I fixed the MERGE_VALUES issue in r213931, so we should
have been fine, right? NOOOPE! Because the core bug was never addressed,
the test in vector-idiv failed when I fixed the MERGE_VALUES node.
Because there are essentially no docs for this node, I had to guess at
how to fix it and tried swapping the operands, restoring the order of
the original code before r212808. While this "fixed" the test case (in
that we produced the write instructions) we were still extracting the
wrong elements of the i64s, and thus PR20355 was still broken.
This commit essentially reverts the big-endian-style extraction part of
r212808 and goes back to the original masks which were correct. Now that
the MERGE_VALUES node formation is also correct, everything works. I've
also included a more detailed test from PR20355 to make sure this stays
fixed.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214011 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-26 03:46:57 +00:00
|
|
|
; SSE41-NEXT: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,0,3,0]
|
|
|
|
; SSE41-NEXT: pmuldq %[[X1]], %xmm0
|
|
|
|
; SSE41-NEXT: pshufd {{.*}} # [[X1]] = [[X1]][1,0,3,0]
|
|
|
|
; SSE41-NEXT: pmuldq %[[X2]], %[[X1]]
|
|
|
|
; SSE41-NEXT: shufps {{.*}} # xmm0 = xmm0[1,3],[[X1]][1,3]
|
|
|
|
; SSE41-NEXT: pshufd {{.*}} # [[X3:xmm[0-9]+]] = xmm0[0,2,1,3]
|
|
|
|
; SSE41-NEXT: movdqa %[[X3]], %xmm0
|
|
|
|
; SSE41-NEXT: psrld $31, %xmm0
|
|
|
|
; SSE41-NEXT: paddd %[[X3]], %xmm0
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
entry:
|
|
|
|
%sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
|
|
|
|
ret <4 x i32> %sdiv
|
|
|
|
}
|