diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 88502549a59..4e66738f680 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -325,6 +325,44 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG)
   return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, JTI);
 }
 
+static bool
+IsWordAlignedBasePlusConstantOffset(SDValue Addr, SDValue &AlignedBase,
+                                    int64_t &Offset)
+{
+  if (Addr.getOpcode() != ISD::ADD) {
+    return false;
+  }
+  ConstantSDNode *CN = 0;
+  if (!(CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    return false;
+  }
+  int64_t off = CN->getSExtValue();
+  const SDValue &Base = Addr.getOperand(0);
+  const SDValue *Root = &Base;
+  if (Base.getOpcode() == ISD::ADD &&
+      Base.getOperand(1).getOpcode() == ISD::SHL) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Base.getOperand(1)
+                                                      .getOperand(1));
+    if (CN && (CN->getSExtValue() >= 2)) {
+      Root = &Base.getOperand(0);
+    }
+  }
+  if (isa<FrameIndexSDNode>(*Root)) {
+    // All frame indices are word aligned
+    AlignedBase = Base;
+    Offset = off;
+    return true;
+  }
+  if (Root->getOpcode() == XCoreISD::DPRelativeWrapper ||
+      Root->getOpcode() == XCoreISD::CPRelativeWrapper) {
+    // All dp / cp relative addresses are word aligned
+    AlignedBase = Base;
+    Offset = off;
+    return true;
+  }
+  return false;
+}
+
 SDValue XCoreTargetLowering::
 LowerLOAD(SDValue Op, SelectionDAG &DAG)
 {
@@ -344,6 +382,61 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG)
   SDValue BasePtr = LD->getBasePtr();
   DebugLoc dl = Op.getDebugLoc();
 
+  SDValue Base;
+  int64_t Offset;
+  if (!LD->isVolatile() &&
+      IsWordAlignedBasePlusConstantOffset(BasePtr, Base, Offset)) {
+    if (Offset % 4 == 0) {
+      // We've managed to infer better alignment information than the load
+      // already has. Use an aligned load.
+      return DAG.getLoad(getPointerTy(), dl, Chain, BasePtr, NULL, 4);
+    }
+    // Lower to
+    // ldw low, base[offset >> 2]
+    // ldw high, base[(offset >> 2) + 1]
+    // shr low_shifted, low, (offset & 0x3) * 8
+    // shl high_shifted, high, 32 - (offset & 0x3) * 8
+    // or result, low_shifted, high_shifted
+    SDValue LowOffset = DAG.getConstant(Offset & ~0x3, MVT::i32);
+    SDValue HighOffset = DAG.getConstant((Offset & ~0x3) + 4, MVT::i32);
+    SDValue LowShift = DAG.getConstant((Offset & 0x3) * 8, MVT::i32);
+    SDValue HighShift = DAG.getConstant(32 - (Offset & 0x3) * 8, MVT::i32);
+
+    SDValue LowAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Base, LowOffset);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Base, HighOffset);
+
+    SDValue Low = DAG.getLoad(getPointerTy(), dl, Chain,
+                              LowAddr, NULL, 4);
+    SDValue High = DAG.getLoad(getPointerTy(), dl, Chain,
+                               HighAddr, NULL, 4);
+    SDValue LowShifted = DAG.getNode(ISD::SRL, dl, MVT::i32, Low, LowShift);
+    SDValue HighShifted = DAG.getNode(ISD::SHL, dl, MVT::i32, High, HighShift);
+    SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, LowShifted, HighShifted);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Low.getValue(1),
+                        High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, 2, dl);
+  }
+
+  if (LD->getAlignment() == 2) {
+    int SVOffset = LD->getSrcValueOffset();
+    SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
+                                 BasePtr, LD->getSrcValue(), SVOffset, MVT::i16,
+                                 LD->isVolatile(), 2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue High = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::i32, Chain,
+                                  HighAddr, LD->getSrcValue(), SVOffset + 2,
+                                  MVT::i16, LD->isVolatile(), 2);
+    SDValue HighShifted = DAG.getNode(ISD::SHL, dl, MVT::i32, High,
+                                      DAG.getConstant(16, MVT::i32));
+    SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, Low, HighShifted);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Low.getValue(1),
+                        High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, 2, dl);
+  }
+
   // Lower to a call to __misaligned_load(BasePtr).
   const Type *IntPtrTy = getTargetData()->getIntPtrType();
   TargetLowering::ArgListTy Args;
@@ -385,6 +478,22 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG)
   SDValue Value = ST->getValue();
   DebugLoc dl = Op.getDebugLoc();
 
+  if (ST->getAlignment() == 2) {
+    int SVOffset = ST->getSrcValueOffset();
+    SDValue Low = Value;
+    SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
+                               DAG.getConstant(16, MVT::i32));
+    SDValue StoreLow = DAG.getTruncStore(Chain, dl, Low, BasePtr,
+                                         ST->getSrcValue(), SVOffset, MVT::i16,
+                                         ST->isVolatile(), 2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue StoreHigh = DAG.getTruncStore(Chain, dl, High, HighAddr,
+                                          ST->getSrcValue(), SVOffset + 2,
+                                          MVT::i16, ST->isVolatile(), 2);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
+  }
+
   // Lower to a call to __misaligned_store(BasePtr, Value).
   const Type *IntPtrTy = getTargetData()->getIntPtrType();
   TargetLowering::ArgListTy Args;
diff --git a/test/CodeGen/XCore/unaligned_load.ll b/test/CodeGen/XCore/unaligned_load.ll
index a6a50893b91..c1372ed42f9 100644
--- a/test/CodeGen/XCore/unaligned_load.ll
+++ b/test/CodeGen/XCore/unaligned_load.ll
@@ -1,5 +1,11 @@
 ; RUN: llvm-as < %s | llc -march=xcore > %t1.s
 ; RUN: grep "bl __misaligned_load" %t1.s | count 1
+; RUN: grep ld16s %t1.s | count 2
+; RUN: grep ldw %t1.s | count 2
+; RUN: grep shl %t1.s | count 2
+; RUN: grep shr %t1.s | count 1
+; RUN: grep zext %t1.s | count 1
+; RUN: grep "or " %t1.s | count 2
 
 ; Byte aligned load. Expands to call to __misaligned_load.
 define i32 @align1(i32* %p) nounwind {
@@ -7,3 +13,19 @@ entry:
   %0 = load i32* %p, align 1              ; <i32> [#uses=1]
   ret i32 %0
 }
+
+; Half word aligned load. Expands to two 16-bit loads.
+define i32 @align2(i32* %p) nounwind {
+entry:
+  %0 = load i32* %p, align 2              ; <i32> [#uses=1]
+  ret i32 %0
+}
+
+@a = global [5 x i8] zeroinitializer, align 4
+
+; Constant offset from word aligned base. Expands to two 32-bit loads.
+define i32 @align3() nounwind {
+entry:
+  %0 = load i32* bitcast (i8* getelementptr ([5 x i8]* @a, i32 0, i32 1) to i32*), align 1
+  ret i32 %0
+}
diff --git a/test/CodeGen/XCore/unaligned_store.ll b/test/CodeGen/XCore/unaligned_store.ll
index b7a519299fd..120d6529ece 100644
--- a/test/CodeGen/XCore/unaligned_store.ll
+++ b/test/CodeGen/XCore/unaligned_store.ll
@@ -1,5 +1,7 @@
 ; RUN: llvm-as < %s | llc -march=xcore > %t1.s
 ; RUN: grep "bl __misaligned_store" %t1.s | count 1
+; RUN: grep st16 %t1.s | count 2
+; RUN: grep shr %t1.s | count 1
 
 ; Byte aligned store. Expands to call to __misaligned_store.
 define void @align1(i32* %p, i32 %val) nounwind {
@@ -7,3 +9,10 @@ entry:
   store i32 %val, i32* %p, align 1
   ret void
 }
+
+; Half word aligned store. Expands to two 16-bit stores.
+define void @align2(i32* %p, i32 %val) nounwind {
+entry:
+  store i32 %val, i32* %p, align 2
+  ret void
+}
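
For reference, here is a stand-alone C++ sketch, not part of the patch, of the arithmetic behind the ldw/shr/shl/or sequence that LowerLOAD now emits for a word-aligned base plus a non-word-aligned constant offset. It assumes a little-endian target, as the XCore is; the helper names load_word and load_base_plus_offset are hypothetical:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for a word-aligned ldw; p must point at a 4-byte-aligned word.
static uint32_t load_word(const uint8_t *p) {
  uint32_t w;
  std::memcpy(&w, p, 4);
  return w;
}

// Mirrors the expansion: round the offset down to a word boundary, load the
// two words straddling the target, and recombine them with shifts and an or.
static uint32_t load_base_plus_offset(const uint8_t *base, int64_t offset) {
  assert(offset % 4 != 0 && "the Offset % 4 == 0 case uses a single ldw");
  uint32_t low = load_word(base + (offset & ~0x3));          // ldw low
  uint32_t high = load_word(base + (offset & ~0x3) + 4);     // ldw high
  uint32_t low_shifted = low >> ((offset & 0x3) * 8);        // shr
  uint32_t high_shifted = high << (32 - (offset & 0x3) * 8); // shl
  return low_shifted | high_shifted;                         // or
}

int main() {
  alignas(4) uint8_t buf[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
  uint32_t expected;
  std::memcpy(&expected, buf + 1, 4); // byte-wise unaligned read: 0x55443322
  std::printf("%#x == %#x\n", expected, load_base_plus_offset(buf, 1));
  return 0;
}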
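
Likewise, a sketch (again not part of the patch) of what the getAlignment() == 2 paths compute: the load ors a zero-extended low halfword with the high halfword shifted left by 16, and the store splits the value with a 16-bit right shift. The load16/store16 helpers are hypothetical stand-ins for the halfword-aligned ld16s/st16 accesses:

#include <cstdint>
#include <cstring>

static uint32_t load16(const uint8_t *p) {
  uint16_t h;
  std::memcpy(&h, p, 2);
  return h; // zero-extend to 32 bits
}

static void store16(uint8_t *p, uint32_t v) {
  uint16_t h = (uint16_t)v; // truncating store, as with getTruncStore
  std::memcpy(p, &h, 2);
}

static uint32_t load_align2(const uint8_t *p) {
  return load16(p) | (load16(p + 2) << 16); // or result, low, high << 16
}

static void store_align2(uint8_t *p, uint32_t v) {
  store16(p, v);           // low halfword at offset 0
  store16(p + 2, v >> 16); // shr high, value, 16; stored at offset 2
}

int main() {
  alignas(2) uint8_t buf[4];
  store_align2(buf, 0xCAFEBABE);
  return load_align2(buf) == 0xCAFEBABE ? 0 : 1; // round-trips
}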