diff --git a/codeCore/src/prog8/code/ast/AstExpressions.kt b/codeCore/src/prog8/code/ast/AstExpressions.kt
index 9c58e9910..897523c68 100644
--- a/codeCore/src/prog8/code/ast/AstExpressions.kt
+++ b/codeCore/src/prog8/code/ast/AstExpressions.kt
@@ -379,6 +379,16 @@ class PtString(val value: String, val encoding: Encoding, position: Position) :
 class PtTypeCast(type: BaseDataType, position: Position) : PtExpression(DataType.forDt(type), position) {
     val value: PtExpression
         get() = children.single() as PtExpression
+
+    /** Builds a new cast node with the same type and position; only a [PtIdentifier] operand can be copied so far. */
+    fun copy(): PtTypeCast {
+        val clone = PtTypeCast(type.base, position)
+        val operand = children[0]
+        if (operand !is PtIdentifier)
+            TODO("cannot copy node ${children[0]}")
+        clone.add(operand.copy())
+        return clone
+    }
 }
 
 
diff --git a/codeCore/src/prog8/code/optimize/Optimizer.kt b/codeCore/src/prog8/code/optimize/Optimizer.kt
index 77d3ef3ff..a3c09776e 100644
--- a/codeCore/src/prog8/code/optimize/Optimizer.kt
+++ b/codeCore/src/prog8/code/optimize/Optimizer.kt
@@ -4,12 +4,14 @@ import prog8.code.StExtSub
 import prog8.code.SymbolTable
 import prog8.code.ast.*
 import prog8.code.core.*
+import prog8.code.target.VMTarget
 
 
 fun optimizeSimplifiedAst(program: PtProgram, options: CompilationOptions, st: SymbolTable, errors: IErrorReporter) {
     if (!options.optimize)
         return
-    while (errors.noErrors() && optimizeAssignTargets(program, st) > 0) {
+    while (errors.noErrors()) {     // run both passes again until a round changes nothing
+        if (optimizeAssignTargets(program, st) + optimizeWordPlusTimesTwo(program, options) <= 0) break
         // keep rolling
     }
 }
@@ -96,3 +98,42 @@ internal fun isSame(identifier: PtIdentifier, type: DataType, returnedRegister:
     }
     return false   // there are no identifiers directly corresponding to cpu registers
 }
+
+
+/**
+ * Rewrites `word +/- ((byte as word) << 1)` into `(word +/- (byte as word)) +/- (byte as word)`.
+ * Does nothing for the VM target. Returns the number of expressions that were rewritten.
+ */
+private fun optimizeWordPlusTimesTwo(program: PtProgram, options: CompilationOptions): Int {
+    if(options.compTarget.name== VMTarget.NAME)
+        return 0
+    var changes = 0
+    walkAst(program) { node: PtNode, _: Int ->
+        // a plain `word * 2` is left untouched: an optional optimization pass must never abort the compilation
+        if (node is PtBinaryExpression && node.operator=="<<" && node.right.asConstValue()==1.0) {
+            val typecast = node.left as? PtTypeCast
+            val addition = node.parent as? PtBinaryExpression
+            if(typecast!=null && typecast.type.isWord && typecast.value is PtIdentifier
+                && addition!=null && addition.type.isWord) {
+                val shiftIsLeft = addition.left===node
+                // '+' commutes, '-' does not:  (b<<1) - w  is not  (w - b) - b,  so then only rewrite a right-hand shift
+                if(addition.operator=="+" || (addition.operator=="-" && !shiftIsLeft)) {
+                    // word + ((byte as word)<<1)  (== word + byte*2)  -->  (word + (byte as word)) + (byte as word)
+                    val parent = addition.parent
+                    val index = parent.children.indexOf(addition)
+                    val addFirst = PtBinaryExpression(addition.operator, addition.type, addition.position)
+                    val addSecond = PtBinaryExpression(addition.operator, addition.type, addition.position)
+                    addFirst.add(if(shiftIsLeft) addition.right else addition.left)
+                    addFirst.add(typecast)
+                    addSecond.add(addFirst)
+                    addSecond.add(typecast.copy())
+                    parent.children[index] = addSecond
+                    addSecond.parent = parent
+                    changes++
+                }
+            }
+        }
+        true
+    }
+    return changes
+}
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 4f620d10b..8198d489d 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,9 +1,6 @@
 TODO
 ====
 
-- word+byte*2 -> word +byte +byte,  word-byte*2 -> word-byte-byte   (check that it gets properly word-extended!)
-- optimize pokew and peekw to no longer do a jsr
-
 - Make some of the target machine config externally configurable (for 1 new target, the existing ones should stay as they are for the time being)
 
 - add paypal donation button as well?
@@ -71,7 +68,7 @@ IR/VM
 
 Libraries
 ---------
-- Sorting module gnomesort_uw could be optimized more, rewrite in asm? Shellshort seems consistently faster even if most of the words are already sorted.
+- Sorting module gnomesort_uw could be optimized more by fully rewriting it in asm? Shellsort seems consistently faster even if most of the words are already sorted.
 - Add split-word array sorting routines to sorting module?
 - add even more general raster irq routines to build some sort of "copper list" , like Oscar64 has?
 - pet32 target: make syslib more complete (missing kernal routines)?
diff --git a/examples/test.p8 b/examples/test.p8
index 3211c3401..b657dfc08 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,3 +1,4 @@
+;%import emudbg
 %import textio
 %option no_sysinit
 %zeropage basicsafe
@@ -5,51 +6,146 @@
 
 
 main {
-
-    ubyte @nozp @shared staticvar=51
-
     sub start() {
-        ubyte x,y
+        uword @shared w1, w2
+        ubyte @shared b
 
-        x = 88
-        y = 99
-        ubyte a,b,c,d = multi4()
-        ubyte e,f = multi2()
-
-        txt.print_ub(a)
-        txt.spc()
-        txt.print_ub(b)
-        txt.spc()
-        txt.print_ub(c)
-        txt.spc()
-        txt.print_ub(d)
-        txt.spc()
-        txt.print_ub(e)
-        txt.spc()
-        txt.print_ub(f)
-        txt.nl()
+        w1 = w2 + b*$0002       ; uword + ubyte*2 (presumably the case the word+byte*2 optimization is aimed at)
+        w1 = w2 + b + b         ; reference form: the byte added twice
+;        w2 = (w1 + b as uword) + (b as uword)
 
 
-    }
-
-    sub single() -> ubyte {
-        return cx16.r0L+cx16.r1L
-    }
-    asmsub multi1() -> ubyte @A, ubyte @Y {
-        %asm {{
-            lda  #1
-            ldy  #2
-            rts
-        }}
-    }
-
-    sub multi2() -> ubyte, ubyte {
-        cx16.r0++
-        return 33,44
-    }
-
-    sub multi4() -> ubyte, ubyte, ubyte, ubyte {
-        cx16.r0++
-        return 3,4,5,6
+;        cx16.r0 = cx16.r1 + cx16.r0L*2
+;        cx16.r0 = cx16.r1 + cx16.r0L*$0002
+;        cx16.r0 = cx16.r1 + cx16.r0L + cx16.r0L
+;        cx16.r0 = cx16.r1 - cx16.r0L*2
+;        cx16.r0 = cx16.r1 - cx16.r0L*$0002
+;        cx16.r0 = cx16.r1 - cx16.r0L - cx16.r0L
     }
 }
+
+
+/*
+mainxxx {
+
+    uword[50] @nosplit warray1
+    uword[50] @nosplit warray2
+
+    sub fill_arrays() {
+        math.rndseed(999,1234)
+        for cx16.r0L in 0 to len(warray1)-1 {
+            warray1[cx16.r0L] = math.rndw()
+            warray2[cx16.r0L] = cx16.r0L * (100 as uword)
+        }
+        warray2[40] = 9900
+        warray2[44] = 9910
+        warray2[48] = 9920
+    }
+
+    sub perf_reset() {
+        emudbg.reset_cpu_cycles()
+    }
+
+    sub perf_print() {
+        cx16.r4, cx16.r5 = emudbg.cpu_cycles()
+        txt.print_uwhex(cx16.r5, true)
+        txt.print_uwhex(cx16.r4, false)
+        txt.nl()
+    }
+
+    sub start() {
+        sys.set_irqd()
+        fill_arrays()
+
+        txt.print("\ngnomesort (words):\n")
+        perf_reset()
+        gnomesort_uw(warray1, len(warray1))
+        perf_print()
+        for cx16.r0L in 0 to len(warray1)-1 {
+            txt.print_uw(warray1[cx16.r0L])
+            txt.chrout(',')
+        }
+        txt.nl()
+
+        txt.print("\ngnomesort (words) almost sorted:\n")
+        perf_reset()
+        gnomesort_uw(warray2, len(warray2))
+        perf_print()
+        for cx16.r0L in 0 to len(warray2)-1 {
+            txt.print_uw(warray2[cx16.r0L])
+            txt.chrout(',')
+        }
+        txt.nl()
+        txt.nl()
+
+        fill_arrays()
+
+        txt.print("\ngnomesort_opt (words):\n")
+        perf_reset()
+        gnomesort_uw_opt(warray1, len(warray1))
+        perf_print()
+        for cx16.r0L in 0 to len(warray1)-1 {
+            txt.print_uw(warray1[cx16.r0L])
+            txt.chrout(',')
+        }
+        txt.nl()
+
+        txt.print("\ngnomesort_opt (words) almost sorted:\n")
+        perf_reset()
+        gnomesort_uw_opt(warray2, len(warray2))
+        perf_print()
+        for cx16.r0L in 0 to len(warray2)-1 {
+            txt.print_uw(warray2[cx16.r0L])
+            txt.chrout(',')
+        }
+        txt.nl()
+        sys.clear_irqd()
+        repeat {
+        }
+    }
+
+
+    sub gnomesort_uw(uword values, ubyte num_elements) {
+        ; TODO optimize this more, rewrite in asm?
+        ubyte @zp pos = 1
+        while pos != num_elements {
+            uword @requirezp ptr = values+(pos*$0002)
+            cx16.r0 = peekw(ptr-2)
+            cx16.r1 = peekw(ptr)
+            if cx16.r0<=cx16.r1
+                pos++
+            else {
+                ; swap elements
+                pokew(ptr-2, cx16.r1)
+                pokew(ptr, cx16.r0)
+                pos--
+                if_z
+                    pos++
+            }
+        }
+    }
+
+    sub gnomesort_uw_opt(uword values, ubyte num_elements) {
+        ; TODO optimize this more, rewrite in asm?
+        ubyte @zp pos = 1
+        uword @requirezp ptr = values+2
+        while pos != num_elements {
+            cx16.r0 = peekw(ptr-2)
+            cx16.r1 = peekw(ptr)
+            if cx16.r0<=cx16.r1 {
+                pos++
+                ptr+=2
+            }
+            else {
+                ; swap elements
+                pokew(ptr-2, cx16.r1)
+                pokew(ptr, cx16.r0)
+                if pos>1 {
+                    pos--
+                    ptr-=2
+                }
+            }
+        }
+    }
+}
+*/