diff --git a/imul8xe.s b/imul8xe.s
new file mode 100644
index 0000000..855e044
--- /dev/null
+++ b/imul8xe.s
@@ -0,0 +1,175 @@
+FR0    = $d4 ; float48; used below as the 16-bit running product during table init
+PORTB = $d301 ; written with the bank-select values (see the bank_switch macro)
+
+; NOTE(review): FR1, FR2 and temp2 are referenced below but not defined in this file - confirm where they come from
+EXTENDED_RAM = $4000 ; 16KiB bank on the XE
+
+; Lookup table: multiplier byte -> PORTB value selecting its table bank.
+; The bank number (top 2 bits of the byte) goes into PORTB bits 2-3.
+.align 256 ; page-aligned so the indexed read never pays a page-cross cycle
+bank_switch_table:
+    .repeat 256, i
+        .byte ((i & $c0) >> 4) | $e1
+    .endrepeat
+
+; Unsigned 8x8 -> 16 bit multiply via the extended-RAM tables:
+; dest = arg1 * arg2. About 59 cycles (arg1 even) to 77 (arg1 odd)
+; with zero-page operands.
+; clobbers a, x, y, dest to dest + 3; leaves the table bank mapped
+.macro imul8xe dest, arg1, arg2
+.local done
+.local output
+.local ptr
+
+    output = dest
+    ptr = dest + 2 ; scratch pointer, must live in zero page for (ptr),y
+
+    ; entry address low byte: arg1 minus its LSB (entries are 2 bytes wide)
+    ; entry address high byte: low 6 bits of arg2, based at $4000
+    lda arg1     ; 3 cyc
+    and #$fe     ; 2 cyc
+    sta ptr      ; 3 cyc
+    lda arg2     ; 3 cyc
+    and #$3f     ; 2 cyc
+    clc          ; 2 cyc
+    adc #$40     ; 2 cyc
+    sta ptr + 1  ; 3 cyc
+
+    ; top 2 bits of arg2 are the table bank selector
+    ldx arg2                ; 3 cyc
+    lda bank_switch_table,x ; 4 cyc
+    sta PORTB               ; 4 cyc
+
+    ; copy the entry (arg2 * even part of arg1) into output
+    ldy #0       ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output   ; 3 cyc
+    iny          ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output+1 ; 3 cyc
+
+    ; note: the table bank is deliberately left mapped (saves 6 cycles),
+    ; so those 16kb have to be switched back to base RAM by the caller
+    ; if they are needed for anything else
+
+    ; check that 1 bit we skipped to fit into space
+    lda arg1     ; 3 cyc
+    and #1       ; 2 cyc
+    beq done     ; 2 cyc (3 when taken)
+
+    ; add the second param one last time for the skipped bit
+    clc          ; 2 cyc
+    lda arg2     ; 3 cyc
+    adc output   ; 3 cyc
+    sta output   ; 3 cyc
+    lda #0       ; 2 cyc
+    adc output+1 ; 3 cyc
+    sta output+1 ; 3 cyc
+
+done:
+.endmacro
+
+; Map extended-RAM bank 0-3 into the $4000 window; clobbers a.
+.macro bank_switch bank
+    lda #((bank << 2) | $e1)
+    sta PORTB
+.endmacro
+
+; Fill all four extended-RAM banks with the multiplication table used by
+; the imul8xe macro. The last bank selected (3) is left mapped on return.
+.proc imul8xe_init
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+
+    ; arg2 keeps counting across sections: each bank gets the next 64 values
+    lda #$00
+    sta arg1
+    sta arg2
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc
+
+; Initialize one 16 KB chunk (the bank currently mapped at $4000) of the table
+; input: arg2 (FR2) = first multiplier value of this chunk
+; output: arg2 advanced by 64; arg1 (FR1) ends where it started
+; clobbers: a, y, result (FR0), ptr (temp2)
+.proc imul8xe_init_section
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+    ptr = temp2
+
+    ; ptr walks the bank window starting at $4000
+    lda #$00
+    sta ptr
+    lda #$40
+    sta ptr + 1
+
+    ldy #0
+
+    ; outer loop: one 256-byte page per value of arg2
+outer_loop:
+    ; reset result to 0
+    lda #0
+    sta result
+    sta result + 1
+
+    ; inner loop: 128 entries, arg1 = $00, $02 ... $fe
+inner_loop:
+    ; copy result to data set, low byte first
+    lda result
+    sta (ptr),y
+    lda result + 1
+    iny
+    sta (ptr),y
+    dey
+
+    ; result += 2 * arg2
+    clc
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result + 1
+    clc
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result + 1
+
+    ; inner loop check: done when the low pointer byte wraps to 0
+    inc arg1
+    inc arg1
+    inc ptr
+    inc ptr
+    bne inner_loop
+
+    ; outer loop check: stop at $8000, the end of the 16 KB window
+    inc arg2
+    inc ptr + 1
+    lda ptr + 1
+    cmp #$80
+    bne outer_loop
+
+    rts
+.endproc
diff --git a/mandel.s b/mandel.s
index 3622995..d198989 100644
--- a/mandel.s
+++ b/mandel.s
@@ -74,6 +74,9 @@ width = 160
 half_width = width >> 1
 stride = width >> 2
 
+EXTENDED_RAM = $4000 ; start of the 16KiB bank-switched window on the XE
+PORTB  = $D301 ; memory & bank-switch control register for XL/XE
+
 DMACTL = $D400
 DLISTL = $D402
 DLISTH = $D403
@@ -344,14 +347,6 @@ fill_masks:
     neg 4, arg
 .endmacro
 
-; 518 - 828 cyc
-.macro imul16 dest, arg1, arg2
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
-    copy32 dest, FR2  ; 24 cyc
-.endmacro
-
 .macro shift_round_16 arg, shift
     .repeat shift
         shl32 arg
@@ -362,7 +357,7 @@ fill_masks:
 .macro imul16_round dest, arg1, arg2, shift
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
+    jsr imul16_func   ; cycles vary: imul8xe_init may patch this routine into a jmp to imul16xe_func
     shift_round_16 FR2, shift
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
@@ -372,54 +367,259 @@ fill_masks:
     .local under256
     .local next
     .local small_product
+    ; circa 92 cycles? this doesn't seem right
+    ; 81-92 cycles
     .scope
         mul_factor_a   = arg1
         mul_factor_x   = arg2
         mul_product_lo = dest
         mul_product_hi = dest + 1
 
-        lda mul_factor_a      ; setup: 6 cycles
-        ;ldx mul_factor_x
+        lda mul_factor_a      ; 3 cyc
 
-        clc                   ; (a + x)^2/2: 23 cycles
-        adc mul_factor_x
-        tax
-        bcc under256
-        lda mul_hibyte512,x
-        bcs next
+        ; (a + x)^2/2
+        clc                   ; 2 cyc         
+        adc mul_factor_x      ; 3 cyc
+        tax                   ; 2 cyc
+        bcc under256          ; 2 cyc
+        lda mul_hibyte512,x   ; 4 cyc
+        bcs next              ; 2 cyc
     under256:
-        lda mul_hibyte256,x
-        sec
+        lda mul_hibyte256,x   ; 4 cyc
+        sec                   ; 2 cyc
     next:
-        sta mul_product_hi
-        lda mul_lobyte256,x
+        sta mul_product_hi    ; 3 cyc
+        lda mul_lobyte256,x   ; 4 cyc
 
-        ldx mul_factor_a      ; - a^2/2: 20 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
+        ; - a^2/2
+        ldx mul_factor_a      ; 3 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc
+        sta mul_product_hi    ; 3 cyc
 
-        ldx mul_factor_x      ; + x & a & 1: 22 cycles
-        txa                   ; (this is a kludge to correct a
-        and mul_factor_a      ; roundoff error that makes odd * odd too low)
-        and #1
+        ; + x & a & 1:
+        ; (this is a kludge to correct a
+        ; roundoff error that makes odd * odd too low)
+        ldx mul_factor_x      ; 3 cyc
+        txa                   ; 2 cyc
+        and mul_factor_a      ; 3 cyc
+        and #1                ; 2 cyc
 
-        clc
-        adc mul_product_lo
-        bcc small_product
-        inc mul_product_hi
+        clc                   ; 2 cyc
+        adc mul_product_lo    ; 3 cyc
+        bcc small_product     ; 2 cyc
+        inc mul_product_hi    ; 5 cyc
+
+        ; - x^2/2
     small_product:
-        sec                   ; - x^2/2: 25 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
+        sec                   ; 2 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc
+        sta mul_product_hi    ; 3 cyc
     .endscope
 .endmacro
 
+; PORTB values indexed by a multiplier byte: its top 2 bits pick the bank
+;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
+bank_switch_table:
+    .repeat 256, value
+        .byte $e1 | ((value >> 6) << 2)
+    .endrepeat
+
+; Select table bank `bank` (0-3) by writing its PORTB value. Clobbers a.
+.macro bank_switch bank
+    lda #($e1 | (bank << 2))
+    sta PORTB
+.endmacro
+
+; Table-driven unsigned multiply: dest (16 bits) = arg1 * arg2.
+; About 59 cycles when arg1 is even, 77 when it is odd.
+; clobbers a, x, y and dest to dest + 3
+.macro imul8xe dest, arg1, arg2
+.local finished
+.local product
+.local entry
+
+    product = dest
+    entry = dest + 2 ; scratch pointer to the table entry
+
+    ; entry low byte: arg1 without its LSB (each entry is 2 bytes)
+    ; entry high byte: low 6 bits of arg2, offset into the bank window
+    lda arg1                   ; 3 cyc
+    and #%11111110             ; 2 cyc
+    sta entry                  ; 3 cyc
+    lda arg2                   ; 3 cyc
+    and #%00111111             ; 2 cyc
+    clc                        ; 2 cyc
+    adc #.hibyte(EXTENDED_RAM) ; 2 cyc
+    sta entry + 1              ; 3 cyc
+
+    ; the top 2 bits of arg2 pick which bank holds the entry
+    ldx arg2                ; 3 cyc
+    lda bank_switch_table,x ; 4 cyc
+    sta PORTB               ; 4 cyc
+
+    ; fetch the 16-bit entry, low byte first
+    ldy #0          ; 2 cyc
+    lda (entry),y   ; 5 cyc
+    sta product     ; 3 cyc
+    iny             ; 2 cyc
+    lda (entry),y   ; 5 cyc
+    sta product + 1 ; 3 cyc
+
+    ; The bank is intentionally left mapped: restoring PORTB here
+    ; would cost 6 more cycles per multiply. Code that needs the
+    ; 16kb window for anything else must switch it back to base RAM.
+    ;;; restore memory
+    ;;lda #$81     ; 2 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
+
+    ; the table only covers even arg1; was the dropped bit set?
+    lda arg1        ; 3 cyc
+    and #%00000001  ; 2 cyc
+    beq finished    ; 2 cyc (3 when taken)
+
+    ; yes: add arg2 once more to account for it
+    clc             ; 2 cyc
+    lda arg2        ; 3 cyc
+    adc product     ; 3 cyc
+    sta product     ; 3 cyc
+    lda #0          ; 2 cyc
+    adc product + 1 ; 3 cyc
+    sta product + 1 ; 3 cyc
+
+finished:
+.endmacro
+
+; Probe for bank-switched extended RAM; if found, fill the multiplication
+; tables and patch imul16_func, else undo the probe. Clobbers a, x, y.
+.proc imul8xe_init
+    ; remember what the probe is about to disturb
+    ldy PORTB
+    ldx EXTENDED_RAM
+
+    ; banks 0 and 1 only hold different values if they are distinct memory
+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0
+    lda EXTENDED_RAM
+    beq init
+
+    ; no bank switching available: the writes hit base RAM, so undo them
+    sty PORTB
+    stx EXTENDED_RAM
+    rts
+
+init:
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+
+    ; create the lookup table, going through the input set in four 16KB chunks
+    arg1 = FR1
+    arg2 = FR2
+    lda #$00
+    sta arg1
+    sta arg2
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc
+
+; Fill the 16 KB bank currently mapped at EXTENDED_RAM with products.
+; Each 256-byte page holds arg2 * 0, arg2 * 2 ... arg2 * 254 as 16-bit
+; entries, low byte first; arg2 (FR2) advances by one per page.
+; clobbers: a, y, FR0 (running product), temp2 (write pointer)
+.proc imul8xe_init_section
+    arg1 = FR1
+    arg2 = FR2
+    product = FR0
+    page = temp2
+
+    ; point at the first page of the window
+    lda #.lobyte(EXTENDED_RAM)
+    sta page
+    lda #.hibyte(EXTENDED_RAM)
+    sta page + 1
+
+    ; y is the byte offset of the current entry within the page
+    ldy #0
+
+    ; one pass per page: 64 values of arg2
+next_page:
+    ; every page starts over at arg2 * 0
+    lda #0
+    sta product
+    sta product + 1
+
+    ; one pass per entry: 128 even values of arg1
+next_entry:
+    ; store the running product, low byte then high byte
+    lda product
+    sta (page),y
+    iny
+    lda product + 1
+    sta (page),y
+    iny
+
+    ; product += arg2, first time
+    clc
+    lda product
+    adc arg2
+    sta product
+    bcc added_once
+    inc product + 1
+added_once:
+
+    ; product += arg2, second time (arg1 moves in steps of 2)
+    clc
+    lda product
+    adc arg2
+    sta product
+    bcc added_twice
+    inc product + 1
+added_twice:
+
+    ; next entry; y wraps to 0 when the page is full
+    inc arg1
+    inc arg1
+    cpy #0
+    bne next_entry
+
+    ; next page and multiplier; stop at the end of the 16 KB window
+    inc arg2
+    inc page + 1
+    lda page + 1
+    cmp #.hibyte(EXTENDED_RAM + $4000)
+    bne next_page
+
+    rts
+.endproc
+
 .proc imul16_func
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
@@ -461,6 +661,47 @@ arg2_pos:
     rts ; 6 cyc
 .endproc
 
+.proc imul16xe_func
+    arg1 = FR0   ; 16-bit arg; this routine only reads it
+    arg2 = FR1   ; 16-bit arg; this routine only reads it
+    result = FR2 ; 32-bit result
+    inter = temp2 ; partial product; imul8xe also uses inter + 2 and inter + 3 as scratch
+    ; Signed 16x16 -> 32-bit multiply built from four imul8xe table multiplies:
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+    ; l1*l2 -> low word; imul8xe leaves scratch in result + 2 and + 3, so clear them afterwards
+    imul8xe result, arg1, arg2
+    lda #0
+    sta result + 2
+    sta result + 3
+    ; h1*l2, added in one byte up
+    imul8xe inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+    add_carry result + 3 ; presumably propagates add16's carry into the top byte - confirm in the macro
+    ; l1*h2, added in one byte up
+    imul8xe inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3 ; presumably propagates add16's carry into the top byte - confirm in the macro
+    ; h1*h2, added in two bytes up
+    imul8xe inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1 ; sign bit of arg1 lands in N
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2 ; arg1 < 0: high word -= arg2
+arg1_pos:
+    lda arg2 + 1 ; sign bit of arg2 lands in N
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1 ; arg2 < 0: high word -= arg1
+arg2_pos:
+
+    rts ; 6 cyc
+.endproc
+
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -966,6 +1207,8 @@ zero_byte_loop:
 
 .proc start
 
+    jsr imul8xe_init
+
     ; ox = 0; oy = 0; zoom = 0
     ; count_frames = 0; count_pixels = 0
     lda #0