diff --git a/imul8xe.s b/imul8xe.s
new file mode 100644
index 0000000..15adf64
--- /dev/null
+++ b/imul8xe.s
@@ -0,0 +1,75 @@
+FR0    = $d4 ; float48 (not referenced anywhere else in this file)
+PORTB = $d301 ; register the imul8xe macro stores to for bank-switching
+
+
+EXTENDED_RAM = $4000 ; 16KiB bank on the XE; base address of the banked table window
+
+; lookup table for top byte -> PORTB value for bank-switch (page-aligned: no index penalty)
+.align 256
+bankswitch:
+    .repeat 256, i
+        .byte ((i & $c0) >> 4) | $c1 ; index bits 7-6 -> PORTB bits 3-2 (bank select); bits 4-5 stay 0
+    .endrepeat
+
+; imul8xe dest, arg1, arg2: banked table lookup + odd-bit fixup; 59-77 cycles
+; clobbers a, x, y, flags, PORTB, dest to dest + 3
+.macro imul8xe dest, arg1, arg2
+.local done
+.local output
+.local ptr
+    ; arg1/arg2 are re-read after dest is written, so they must not overlap dest..dest+3
+    output = dest
+    ptr = dest + 2 ; scratch space assumed; must be zero page for (ptr),y
+    ; result = table entry + (arg2 if arg1 is odd); presumably arg1 * arg2 - table fill not shown here
+    ; bottom 14 bits except the LSB are the per-bank table index
+    ; add EXTENDED_RAM ($4000) for the bank pointer
+    lda arg1            ; 3 cyc
+    and #$fe            ; 2 cyc - LSB is applied separately at the end
+    sta ptr             ; 3 cyc
+    lda arg2            ; 3 cyc
+    and #$3f            ; 2 cyc
+    clc                 ; 2 cyc
+    adc #>EXTENDED_RAM  ; 2 cyc - high byte of the bank window ($40)
+    sta ptr + 1         ; 3 cyc
+
+    ; top 2 bits are the table bank selector
+    ldx arg2            ; 3 cyc
+    lda bankswitch,x    ; 4 cyc - must match the table's label name
+    sta PORTB           ; 4 cyc
+
+    ; copy the entry into output
+    ; (ptr's low byte is even, so y = 0..1 never crosses a page)
+    ldy #0              ; 2 cyc
+    lda (ptr),y         ; 5 cyc
+    sta output          ; 3 cyc
+    iny                 ; 2 cyc
+    lda (ptr),y         ; 5 cyc
+    sta output+1        ; 3 cyc
+
+    ; note: we are not restoring memory to save 6 cycles!
+    ; this means those 16kb have to be switched back to base RAM
+    ; if we need to use them anywhere else
+    ;;; restore memory
+    ;;lda #$81     ; 2 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
+
+    ; check that 1 bit we skipped to fit into space
+    lda arg1            ; 3 cyc
+    and #1              ; 2 cyc
+    beq done            ; 2 cyc (3 when taken, i.e. arg1 even)
+
+    ; add the second param one last time for the skipped bit
+    clc                 ; 2 cyc
+    lda arg2            ; 3 cyc
+    adc output          ; 3 cyc
+    sta output          ; 3 cyc
+    lda #0              ; 2 cyc - arg2 is zero-extended to 16 bits
+    adc output+1        ; 3 cyc
+    sta output+1        ; 3 cyc
+
+done:
+.endmacro
+
+.proc imul8xe_init ; currently only returns; no table setup is performed here
+    rts
+.endproc
diff --git a/mandel.s b/mandel.s
index 3622995..e0a8570 100644
--- a/mandel.s
+++ b/mandel.s
@@ -372,51 +372,59 @@ fill_masks:
     .local under256
     .local next
     .local small_product
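+    ; timing: summing the per-instruction figures below gives about 83-87 cycles
+    ; (taken branches cost 3 cycles; assumes zero-page operands and no page-crossing penalty)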
     .scope
         mul_factor_a   = arg1
         mul_factor_x   = arg2
         mul_product_lo = dest
         mul_product_hi = dest + 1
 
-        lda mul_factor_a      ; setup: 6 cycles
-        ;ldx mul_factor_x
+        lda mul_factor_a      ; 3 cyc
 
-        clc                   ; (a + x)^2/2: 23 cycles
-        adc mul_factor_x
-        tax
-        bcc under256
-        lda mul_hibyte512,x
-        bcs next
+        ; (a + x)^2/2
+        clc                   ; 2 cyc         
+        adc mul_factor_x      ; 3 cyc
+        tax                   ; 2 cyc
+        bcc under256          ; 2 cyc
+        lda mul_hibyte512,x   ; 4 cyc
+        bcs next              ; 2 cyc
     under256:
-        lda mul_hibyte256,x
-        sec
+        lda mul_hibyte256,x   ; 4 cyc
+        sec                   ; 2 cyc
     next:
-        sta mul_product_hi
-        lda mul_lobyte256,x
+        sta mul_product_hi    ; 3 cyc
+        lda mul_lobyte256,x   ; 4 cyc
 
-        ldx mul_factor_a      ; - a^2/2: 20 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
+        ; - a^2/2
+        ldx mul_factor_a      ; 3 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc
+        sta mul_product_hi    ; 3 cyc
 
-        ldx mul_factor_x      ; + x & a & 1: 22 cycles
-        txa                   ; (this is a kludge to correct a
-        and mul_factor_a      ; roundoff error that makes odd * odd too low)
-        and #1
+        ; + x & a & 1:
+        ; (this is a kludge to correct a
+        ; roundoff error that makes odd * odd too low)
+        ldx mul_factor_x      ; 3 cyc
+        txa                   ; 2 cyc
+        and mul_factor_a      ; 3 cyc
+        and #1                ; 2 cyc
 
-        clc
-        adc mul_product_lo
-        bcc small_product
-        inc mul_product_hi
+        clc                   ; 2 cyc
+        adc mul_product_lo    ; 3 cyc
+        bcc small_product     ; 2 cyc
+        inc mul_product_hi    ; 5 cyc
+
+        ; - x^2/2
     small_product:
-        sec                   ; - x^2/2: 25 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
+        sec                   ; 2 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc
+        sta mul_product_hi    ; 3 cyc
     .endscope
 .endmacro