diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b526523063b..379ea7f7f09 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -104,6 +104,12 @@ namespace {
     /// stable indices of nodes within the worklist.
     DenseMap<SDNode *, unsigned> WorklistMap;
 
+    /// \brief Set of nodes which have been combined (at least once).
+    ///
+    /// This is used to allow us to reliably add any operands of a DAG node
+    /// which have not yet been combined to the worklist.
+    SmallPtrSet<SDNode *, 64> CombinedNodes;
+
     // AA - Used for DAG load/store alias analysis.
     AliasAnalysis &AA;
 
@@ -136,6 +142,8 @@ namespace {
     /// removeFromWorklist - remove all instances of N from the worklist.
     ///
     void removeFromWorklist(SDNode *N) {
+      CombinedNodes.erase(N);
+
       auto It = WorklistMap.find(N);
       if (It == WorklistMap.end())
         return; // Not in the worklist.
@@ -1152,6 +1160,17 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
     if (recursivelyDeleteUnusedNodes(N))
       continue;
 
+    DEBUG(dbgs() << "\nCombining: ";
+          N->dump(&DAG));
+
+    // Add any operands of N which have not yet been combined to the worklist
+    // as well. Because the worklist uniques things already, this won't
+    // repeatedly process the same operand.
+    CombinedNodes.insert(N);
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      if (!CombinedNodes.count(N->getOperand(i).getNode()))
+        AddToWorklist(N->getOperand(i).getNode());
+
     WorklistRemover DeadNodes(*this);
 
     SDValue RV = combine(N);
@@ -1172,11 +1191,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
            RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
            "Node was deleted but visit returned new node!");
 
-    DEBUG(dbgs() << "\nReplacing.3 ";
-          N->dump(&DAG);
-          dbgs() << "\nWith: ";
-          RV.getNode()->dump(&DAG);
-          dbgs() << '\n');
+    DEBUG(dbgs() << " ... into: ";
+          RV.getNode()->dump(&DAG));
 
     // Transfer debug value.
     DAG.TransferDbgValues(SDValue(N, 0), RV);
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
deleted file mode 100644
index ce132c6afa4..00000000000
--- a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: llc -O3 < %s | FileCheck %s
-; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
-; Test case for a DAG combiner bug where we combined an indexed load
-; with an extension (sext, zext, or any) into a regular extended load,
-; i.e., dropping the indexed value.
-; <rdar://problem/16389332>
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios"
-
-%class.A = type { i64, i64 }
-%class.C = type { i64 }
-
-; CHECK-LABEL: XX:
-; CHECK: ldr
-define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) {
-entry:
-  br i1 %tst, label %if.then, label %lor.rhs.i
-
-lor.rhs.i:                                        ; preds = %entry
-  %tmp = load i32* %addr, align 4
-  %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
-  %tmp1 = load i64* %y.i.i.i, align 8
-  %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
-  %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17
-  %add12.i = add nsw i32 0, %div11.i
-  %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32
-  %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32
-  %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13
-  %add16.i = add nsw i32 %add12.i, %div15.i
-  %rem.i.i = srem i32 %add16.i, %tmp
-  %idxprom = sext i32 %rem.i.i to i64
-  %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom
-  %tobool533 = icmp eq %class.C* %pC, null
-  br i1 %tobool533, label %while.end, label %while.body
-
-if.then:                                          ; preds = %entry
-  ret i32 42
-
-while.body:                                       ; preds = %lor.rhs.i
-  ret i32 5
-
-while.end:                                        ; preds = %lor.rhs.i
-  %tmp3 = load %class.C** %arrayidx, align 8
-  ret i32 50
-}
diff --git a/test/CodeGen/ARM/aapcs-hfa-code.ll b/test/CodeGen/ARM/aapcs-hfa-code.ll
index 396e83816cc..41ea6137b3d 100644
--- a/test/CodeGen/ARM/aapcs-hfa-code.ll
+++ b/test/CodeGen/ARM/aapcs-hfa-code.ll
@@ -92,12 +92,10 @@ define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double],
   call arm_aapcs_vfpcc void @test_1double_misaligned([4 x double] undef, [4 x double] undef, float undef, double 1.0)
 
 ; CHECK-LABEL: test_1double_misaligned:
-; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
-; CHECK-DAG: mov r[[BASE:[0-9]+]], sp
 ; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0
+; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
 ; CHECK-DAG: movt [[ONEHI]], #16368
-; CHECK-DAG: str [[ONELO]], [r[[BASE]], #8]!
-; CHECK-DAG: str [[ONEHI]], [r[[BASE]], #4]
+; CHECK-DAG: strd [[ONELO]], [[ONEHI]], [sp, #8]
 
 ; CHECK-M4F-LABEL: test_1double_misaligned:
 ; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index 0c13fb1adfb..48558a38e3a 100644
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -757,24 +757,9 @@ define i32 @slti6(i32 %a) nounwind readnone {
 
 ; ALL-LABEL: slti6:
 
-; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; 32-CMOV-DAG: xori [[R1]], [[R1]], 1
-; 32-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 32-CMOV-NOT: movn
-
-; 32-CMP-DAG:  slti [[R1:\$[0-9]+]], $4, 7
-; 32-CMP-DAG:  xori [[R1]], [[R1]], 1
-; 32-CMP-DAG:  addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 32-CMP-NOT:  seleqz
-; 32-CMP-NOT:  selnez
-
-; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; 64-CMOV-DAG: xori [[R1]], [[R1]], 1
-; 64-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 64-CMOV-NOT: movn
-
-; 64-CMP-DAG:  slti [[R1:\$[0-9]+]], $4, 7
-; 64-CMP-DAG:  xori [[R1]], [[R1]], 1
-; 64-CMP-DAG:  addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 64-CMP-NOT:  seleqz
-; 64-CMP-NOT:  selnez
+; ALL-DAG: addiu [[R1:\$[0-9]+]], $zero, 6
+; ALL-DAG: slt [[R1]], [[R1]], $4
+; ALL-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; ALL-NOT: movn
+; ALL-NOT:  seleqz
+; ALL-NOT:  selnez
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index f733d904042..dac4f173e31 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll
@@ -70,8 +70,8 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add
 }
 
 ; SI-LABEL: @trunc_i64_add_to_i32
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
+; SI: S_LOAD_DWORD s[[SREG0:[0-9]+]]
+; SI: S_LOAD_DWORD s[[SREG1:[0-9]+]]
 ; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
 ; SI-NOT: ADDC
 ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 3c3b475d077..a2b7e47f4d4 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -116,10 +116,10 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64
 }
 
 ; SI-LABEL: @trunc_i64_or_to_i32
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
-; SI: S_OR_B32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: S_LOAD_DWORD s[[SREG0:[0-9]+]]
+; SI: S_LOAD_DWORD s[[SREG1:[0-9]+]]
+; SI: S_OR_B32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
 ; SI: BUFFER_STORE_DWORD [[VRESULT]],
 define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = or i64 %b, %a
diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
index 5372bc52278..60025bfcdc8 100644
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
@@ -7,7 +7,7 @@ entry:
         %tmp1 = bitcast double %a to <8 x i8>
         %tmp2 = bitcast double %b to <8 x i8>
         %tmp3 = add <8 x i8> %tmp1, %tmp2
-; CHECK:  paddw
+; CHECK:  paddb
         store <8 x i8> %tmp3, <8 x i8>* null
         ret void
 }
@@ -18,7 +18,7 @@ entry:
         %tmp1 = bitcast double %a to <4 x i16>
         %tmp2 = bitcast double %b to <4 x i16>
         %tmp3 = add <4 x i16> %tmp1, %tmp2
-; CHECK:  paddd
+; CHECK:  paddw
         store <4 x i16> %tmp3, <4 x i16>* null
         ret void
 }
@@ -29,7 +29,7 @@ entry:
         %tmp1 = bitcast double %a to <2 x i32>
         %tmp2 = bitcast double %b to <2 x i32>
         %tmp3 = add <2 x i32> %tmp1, %tmp2
-; CHECK:  paddq
+; CHECK:  paddd
         store <2 x i32> %tmp3, <2 x i32>* null
         ret void
 }
diff --git a/test/CodeGen/X86/i8-umulo.ll b/test/CodeGen/X86/i8-umulo.ll
index ba846f3e9be..1d70f4a8754 100644
--- a/test/CodeGen/X86/i8-umulo.ll
+++ b/test/CodeGen/X86/i8-umulo.ll
@@ -3,7 +3,7 @@
 
 declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
 define i8 @testumulo(i32 %argc) {
-; CHECK: imulw
+; CHECK: imull
 ; CHECK: testb %{{.+}}, %{{.+}}
 ; CHECK: je [[NOOVERFLOWLABEL:.+]]
 ; CHECK: {{.*}}[[NOOVERFLOWLABEL]]:
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index d4174539f2f..dfa8aed4646 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -284,7 +284,7 @@ entry:
 define i32 @func_test1(i32 %p1) nounwind uwtable {
 entry:
 ; CHECK-LABEL: func_test1:
-; CHECK: testb
+; CHECK: andb
 ; CHECK: j
 ; CHECK: ret
   %0 = load i32* @b, align 4
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index f47161e5520..edb8433ec30 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -68,13 +68,13 @@ define i64 @test4(i64 %A) {
   %2 = bitcast <2 x i32> %add to i64
   ret i64 %2
 }
-; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
+; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
 ; Ideally, we should fold that sequence into a single paddd. This is fixed with
 ; the widening legalization.
 ;
 ; CHECK-LABEL: test4
 ; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK: ret
 ;
diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index c8aaf327a7d..b4dc5fd4716 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll
@@ -48,19 +48,22 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind {
 
 ; CHECK: test3
 ; CHECK: movzbl
-; CHECK: shrl
-; CHECK: andl $1
-; CHECK: andl $1
-; CHECK: vmovd
-; CHECK: pinsrd $1
-; CHECK: shrl $2
-; CHECK: andl $1
-; CHECK: pinsrd $2
-; CHECK: shrl $3
-; CHECK: andl $1
-; CHECK: pinsrd $3
-; CHECK: pslld
-; CHECK: psrad
-; CHECK: pmovsxdq
-; CHECK: pmovsxdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: vinsertf128
 ; CHECK: ret
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 51f6fb0dbbe..e3cc2fa668e 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -34,7 +34,7 @@ entry:
 ; X64: movb	%sil, 1(%rdi)
 
 ; X32-LABEL: test2:
-; X32: movzbl	8(%esp), %e[[REG:[abcd]]]x
+; X32: movb	8(%esp), %[[REG:[abcd]]]l
 ; X32: movb	%[[REG]]l, 1(%{{.*}})
 }
 
@@ -67,8 +67,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test4:
-; X32: movl	8(%esp), %e[[REG:[abcd]x]]
-; X32: movw	%[[REG]], 2(%{{.*}})
+; X32: movw	8(%esp), %[[REG:[abcd]]]x
+; X32: movw	%[[REG]]x, 2(%{{.*}})
 }
 
 define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -84,8 +84,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test5:
-; X32: movzwl	8(%esp), %e[[REG:[abcd]x]]
-; X32: movw	%[[REG]], 2(%{{.*}})
+; X32: movw	8(%esp), %[[REG:[abcd]]]x
+; X32: movw	%[[REG]]x, 2(%{{.*}})
 }
 
 define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index d230f1f7e2c..b981871d94b 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -32,7 +32,7 @@ define void @load_2_i16(<2 x i16>* %A)  {
 
 ;CHECK-LABEL: load_2_i32:
 ;CHECK: pmovzxdq
-;CHECK: paddq
+;CHECK: paddd
 ;CHECK: pshufd
 ;CHECK: ret
 define void @load_2_i32(<2 x i32>* %A)  {
@@ -56,7 +56,7 @@ define void @load_4_i8(<4 x i8>* %A)  {
 
 ;CHECK-LABEL: load_4_i16:
 ;CHECK: pmovzxwd
-;CHECK: paddd
+;CHECK: paddw
 ;CHECK: pshufb
 ;CHECK: ret
 define void @load_4_i16(<4 x i16>* %A)  {
@@ -68,7 +68,7 @@ define void @load_4_i16(<4 x i16>* %A)  {
 
 ;CHECK-LABEL: load_8_i8:
 ;CHECK: pmovzxbw
-;CHECK: paddw
+;CHECK: paddb
 ;CHECK: pshufb
 ;CHECK: ret
 define void @load_8_i8(<8 x i8>* %A)  {
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index a3229073751..ec1ce3da5e1 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -122,7 +122,7 @@ define <4 x i32> @test8(<4 x i32> %a) {
 ; SSE41-LABEL: test8:
 ; SSE41: pmuldq
 ; SSE41: pshufd	$49
-; SSE41-NOT: pshufd	$49
+; SSE41: pshufd	$49
 ; SSE41: pmuldq
 ; SSE41: shufps	$-35
 ; SSE41: pshufd	$-40
@@ -134,7 +134,7 @@ define <4 x i32> @test8(<4 x i32> %a) {
 ; SSE-LABEL: test8:
 ; SSE: pmuludq
 ; SSE: pshufd	$49
-; SSE-NOT: pshufd	$49
+; SSE: pshufd	$49
 ; SSE: pmuludq
 ; SSE: shufps	$-35
 ; SSE: pshufd	$-40
@@ -147,7 +147,7 @@ define <4 x i32> @test8(<4 x i32> %a) {
 ; AVX-LABEL: test8:
 ; AVX: vpmuldq
 ; AVX: vpshufd	$49
-; AVX-NOT: vpshufd	$49
+; AVX: vpshufd	$49
 ; AVX: vpmuldq
 ; AVX: vshufps	$-35
 ; AVX: vpshufd	$-40
@@ -162,10 +162,12 @@ define <8 x i32> @test9(<8 x i32> %a) {
   ret <8 x i32> %div
 
 ; AVX-LABEL: test9:
-; AVX: vpalignr $4
 ; AVX: vpbroadcastd
+; AVX: vpalignr $4
+; AVX: vpalignr $4
 ; AVX: vpmuldq
 ; AVX: vpmuldq
+; AVX: vpalignr $4
 ; AVX: vpblendd $170
 ; AVX: vpadd
 ; AVX: vpsrld $31
@@ -195,10 +197,12 @@ define <8 x i32> @test11(<8 x i32> %a) {
   ret <8 x i32> %rem
 
 ; AVX-LABEL: test11:
-; AVX: vpalignr $4
 ; AVX: vpbroadcastd
+; AVX: vpalignr $4
+; AVX: vpalignr $4
 ; AVX: vpmuldq
 ; AVX: vpmuldq
+; AVX: vpalignr $4
 ; AVX: vpblendd $170
 ; AVX: vpadd
 ; AVX: vpsrld $31
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index d115929f5aa..e0b861f29de 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: movl
-; CHECK: paddd
+; CHECK: paddw
 ; CHECK: movlpd
 
 ; Scheduler causes produce a different instruction order
 ; ATOM: movl
-; ATOM: paddd
+; ATOM: paddw
 ; ATOM: movlpd
 
 ; bitcast a v4i16 to v2i32
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 9f6778cff59..3f54ab694c0 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: paddq
+; CHECK: paddd
 
 ; truncate v2i64 to v2i32
 
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 9d298adcc81..0ec3574d69e 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -91,10 +91,9 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
 %i16vec4 = type <4 x i16>
 define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
 ; CHECK-LABEL: add4i16:
-; CHECK:         pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT:    pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT:    paddd   %[[R0]], %[[R1]]
-; CHECK-NEXT:    pshufb  {{.*}}, %[[R1]]
+; CHECK:         movq    (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movq    (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddw   %[[R0]], %[[R1]]
 ; CHECK-NEXT:    movq    %[[R1]], (%{{.*}})
 	%a = load %i16vec4* %ap, align 16
 	%b = load %i16vec4* %bp, align 16
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 641786f5a91..2879fb4e1e7 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -1,10 +1,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 @tm_nest_level = internal thread_local global i32 0
 define i64 @z() nounwind {
-; FIXME: The codegen here is primitive at best and could be much better.
-; The add and the moves can be folded together.
-; CHECK-DAG: movq    $tm_nest_level@TPOFF, %rcx
-; CHECK-DAG: movq    %fs:0, %rax
-; CHECK: addl    %ecx, %eax
+; CHECK:      movq    $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
+; CHECK-NEXT: addl    %fs:0, %e[[R0]]x
+; CHECK-NEXT: andq    $100, %r[[R0]]x
+
   ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
 }
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
index 4fe6c664df5..4317d8ab6a2 100644
--- a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -2,10 +2,10 @@
 
 define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
 ; CHECK-LABEL: LCPI0_0:
-; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
-; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
-; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
-; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
 ; CHECK-LABEL: foo:
 ; CHECK: cmpeqps %xmm1, %xmm0
 ; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
@@ -59,10 +59,10 @@ define void @foo2(<4 x float>* noalias %result) nounwind {
 ; scalar value like what the zext creates.
 define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
 ; CHECK-LABEL: LCPI3_0:
-; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
-; CHECK-NEXT: .long 0                       ## float 0.000000e+00
-; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
-; CHECK-NEXT: .long 0                       ## float 0.000000e+00
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 0                       ## 0x0
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 0                       ## 0x0
 ; CHECK-LABEL: foo3:
 ; CHECK: cmpeqps %xmm1, %xmm0
 ; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0