diff --git a/mandel.s b/mandel.s
index 3ff91d1..d198989 100644
--- a/mandel.s
+++ b/mandel.s
@@ -347,14 +347,6 @@ fill_masks:
     neg 4, arg
 .endmacro
 
-; 518 - 828 cyc
-.macro imul16 dest, arg1, arg2
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
-    copy32 dest, FR2  ; 24 cyc
-.endmacro
-
 .macro shift_round_16 arg, shift
     .repeat shift
         shl32 arg
@@ -365,7 +357,7 @@ fill_masks:
 .macro imul16_round dest, arg1, arg2, shift
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
+    jsr imul16_func   ; cyc vary: imul8xe_init may patch imul16_func into a jmp to imul16xe_func
     shift_round_16 FR2, shift
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
@@ -505,6 +497,30 @@ done:
 
 .proc imul8xe_init
 
+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0
+    lda EXTENDED_RAM
+    beq init
+
+    ; no bank switching available: the write of 1 overwrote the 0 in base RAM, so leave imul16_func unpatched
+    rts
+
+init:
+
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+
+    ; create the lookup table
     ; go through the input set, in four 16KB chunks
 
     arg1 = FR1
@@ -615,6 +631,47 @@ inner_loop:
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
+    imul8 result, arg1, arg2
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1
+arg2_pos:
+
+    rts ; 6 cyc
+.endproc
+
+.proc imul16xe_func
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2
+
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+
     imul8xe result, arg1, arg2
     lda #0
     sta result + 2