diff --git a/Makefile b/Makefile
index 008bf8c..bd14c7d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@
 
 all : mandel.xex
 
-mandel.xex : mandel.o tables.o
-	ld65 -C ./atari-asm-xex.cfg -o $@ $+
+mandel.xex : mandel.o tables.o atari-asm-xex.cfg # relink when the linker config changes
+	ld65 -C ./atari-asm-xex.cfg -o $@ $(filter %.o,$^)
 
 %.o : %.s
 	ca65 -o $@ $<
diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
index 6e6498d..93b80f3 100644
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@@ -6,7 +6,10 @@ SYMBOLS {
 }
 MEMORY {
     ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
+    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
+    # Keep $4000-$7fff clear: it is the access window for bank-switched expanded RAM
+    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
+    # Keep $a000-$bfff out of the load image: BASIC cartridge area, used by mandel.s for its framebuffer
 }
 FILES {
     %O: format = atari;
@@ -21,5 +24,5 @@ SEGMENTS {
     RODATA:   load = MAIN,    type = ro   optional = yes;
     DATA:     load = MAIN,    type = rw   optional = yes;
     BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = MAIN,    type = ro,  optional = yes, align = 256;
+    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
 }
diff --git a/mandel.s b/mandel.s
index 3579b0f..9f594e8 100644
--- a/mandel.s
+++ b/mandel.s
@@ -62,11 +62,11 @@ FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
 FMOVE  = $DDB6 ; MOVE FR0 TO FR1
 
 ; High data
-framebuffer_top    = $8000
-textbuffer         = $8f00
-framebuffer_bottom = $9000
-display_list       = $9f00
-framebuffer_end    = $a000
+framebuffer_top    = $a000 ; whole block moved up by $2000: $8000-$9fff is now the TABLES memory area in atari-asm-xex.cfg
+textbuffer         = $af00
+framebuffer_bottom = $b000
+display_list       = $bf00
+framebuffer_end    = $c000
 
 height = 184
 half_height = height >> 1
@@ -107,12 +107,14 @@ KEY_RIGHT = $87
 
 .struct float48
     exponent .byte
-    mantissa .byte 6
+    mantissa .byte 5 ; 1 exponent byte + 5 mantissa bytes = 6 bytes = 48 bits, as the struct name says
 .endstruct
 
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
+.import sqr_lobyte
+.import sqr_hibyte
 
 .data
 
@@ -374,137 +376,150 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
-; Adapted from https://everything2.com/title/Fast+6502+multiplication
-.macro imul8 dest, arg1, arg2
-    .local under256
-    .local next
-    .local small_product
-    ; circa 92 cycles? this doesn't seem right
-    ; 81-92 cycles
-    .scope
-        mul_factor_a   = arg1
-        mul_factor_x   = arg2
-        mul_product_lo = dest
-        mul_product_hi = dest + 1
+.macro sqr16_round dest, arg, shift
+    ; dest = arg * arg, shifted and rounded; stands in for imul16_round dest, arg, arg, shift (clobbers FR0 and FR2)
+    copy16 FR0, arg   ; 12 cyc - sqr16_func takes its 16-bit operand in FR0
+    jsr sqr16_func      ; ? cyc - leaves the 32-bit square in FR2
+    shift_round_16 FR2, shift ; presumably leaves the wanted 16 bits at FR2 + 2 - confirm in shift_round_16
+    copy16 dest, FR2 + 2  ; 12 cyc
+.endmacro
 
-        lda mul_factor_a      ; 3 cyc
-
-        ; (a + x)^2/2
-        clc                   ; 2 cyc         
-        adc mul_factor_x      ; 3 cyc
-        tax                   ; 2 cyc
-        bcc under256          ; 2 cyc
-        lda mul_hibyte512,x   ; 4 cyc
-        bcs next              ; 2 cyc
-    under256:
-        lda mul_hibyte256,x   ; 4 cyc
-        sec                   ; 2 cyc
-    next:
-        sta mul_product_hi    ; 3 cyc
-        lda mul_lobyte256,x   ; 4 cyc
-
-        ; - a^2/2
-        ldx mul_factor_a      ; 3 cyc
-        sbc mul_lobyte256,x   ; 4 cyc
-        sta mul_product_lo    ; 3 cyc
-        lda mul_product_hi    ; 3 cyc
-        sbc mul_hibyte256,x   ; 4 cyc
-        sta mul_product_hi    ; 3 cyc
-
-        ; + x & a & 1:
-        ; (this is a kludge to correct a
-        ; roundoff error that makes odd * odd too low)
-        ldx mul_factor_x      ; 3 cyc
-        txa                   ; 2 cyc
-        and mul_factor_a      ; 3 cyc
-        and #1                ; 2 cyc
-
-        clc                   ; 2 cyc
-        adc mul_product_lo    ; 3 cyc
-        bcc small_product     ; 2 cyc
-        inc mul_product_hi    ; 5 cyc
-
-        ; - x^2/2
-    small_product:
-        sec                   ; 2 cyc
-        sbc mul_lobyte256,x   ; 4 cyc
-        sta mul_product_lo    ; 3 cyc
-        lda mul_product_hi    ; 3 cyc
-        sbc mul_hibyte256,x   ; 4 cyc
-        sta mul_product_hi    ; 3 cyc
-    .endscope
+; dest (2 bytes, low byte first) = arg * arg by table lookup; arg is one byte; clobbers a, x
+.macro sqr8 dest, arg
+    ldx arg             ; x = table index
+    lda sqr_lobyte,x    ; low byte of arg * arg
+    sta dest
+    lda sqr_hibyte,x    ; high byte of arg * arg
+    sta dest + 1
 .endmacro
 
 ; lookup table for top byte -> PORTB value for bank-switch
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
     .repeat 256, i
-        .byte ((i & $c0) >> 4) | $e1
+        .byte ((i & $c0) >> 4) | $e3 ; top 2 bits of i land in bits 2-3; $e3 = $e1 | $02 also sets bit 1 (presumably BASIC ROM off - confirm against PORTB docs)
     .endrepeat
 
 .macro bank_switch bank
-    lda #((bank << 2) | $e1)
+    lda #((bank << 2) | $e3) ; bank number goes in bits 2-3; $e3 = $e1 | $02 also sets bit 1 (presumably BASIC ROM off - confirm against PORTB docs)
     sta PORTB
 .endmacro
 
+.macro imul8 dest, arg1, arg2, xe
+    .if xe
+        ; xe != 0: dest (16-bit) = arg1 * arg2 (one byte each) using 64KB lookup table in bank-switched RAM
+        ; 58-77 cycles
+        ; clobbers a, x, y, dest to dest + 3, and leaves PORTB selecting the table bank
+        .scope
+            output = dest
+            ptr = dest + 2 ; scratch space assumed; used as a (ptr),y pointer, so dest must be in zero page
+
-; 58-77 cycles
-; clobbers x, y, dest to dest + 3
-.macro imul8xe dest, arg1, arg2
-.local done
-.local output
-.local ptr
-
-    output = dest
-    ptr = dest + 2 ; scratch space assumed
-
-    ; bottom 14 bits except the LSB are the per-bank table index
-    ; add $4000 for the bank pointer
-    lda arg1     ; 3 cyc
-    and #$fe     ; 2 cyc
-    sta ptr      ; 3 cyc
-    lda arg2     ; 3 cyc
-    and #$3f     ; 2 cyc
-    clc          ; 2 cyc
-    adc #$40     ; 2 cyc
-    sta ptr + 1  ; 3 cyc
-    
-    ; top 2 bits are the table bank selector
-    ldx arg2                ; 3 cyc
-    lda bank_switch_table,x ; 4 cyc
-    sta PORTB               ; 4 cyc
+            ; bottom 14 bits except the LSB are the per-bank table index
+            ; add $4000 for the bank pointer
+            lda arg1     ; 3 cyc
+            and #$fe     ; 2 cyc
+            sta ptr      ; 3 cyc
+            lda arg2     ; 3 cyc
+            and #$3f     ; 2 cyc
+            clc          ; 2 cyc
+            adc #$40     ; 2 cyc
+            sta ptr + 1  ; 3 cyc
+            
+            ; top 2 bits are the table bank selector
+            ldx arg2                ; 3 cyc
+            lda bank_switch_table,x ; 4 cyc
+            sta PORTB               ; 4 cyc
 
-
-    ; copy the entry into output
-    ldy #0       ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output   ; 3 cyc
-    iny          ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output+1 ; 3 cyc
+            ; copy the entry into output
+            ldy #0       ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output   ; 3 cyc
+            iny          ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output+1 ; 3 cyc
 
-    ; note: we are not restoring memory to save 6 cycles!
-    ; this means those 16kb have to be switched back to base RAM
-    ; if we need to use them anywhere else
-    ;;; restore memory
-    ;;lda #$81     ; 2 cyc - disabled
-    ;;sta PORTB    ; 4 cyc - disabled
+            ; note: we are not restoring memory to save 6 cycles!
+            ; this means those 16kb have to be switched back to base RAM
+            ; if we need to use them anywhere else
+            ;;; restore memory
+            ;;lda #$81     ; 2 cyc - disabled
+            ;;sta PORTB    ; 4 cyc - disabled
 
-    ; check that 1 bit we skipped to fit into space
-    lda arg1     ; 3 cyc
-    and #1       ; 2 cyc
-    beq done     ; 2 cyc
+            ; check that 1 bit we skipped to fit into space
+            lda arg1     ; 3 cyc
+            and #1       ; 2 cyc
+            beq done     ; 2 cyc
 
-    ; add the second param one last time for the skipped bit
-    clc          ; 2 cyc
-    lda arg2     ; 3 cyc
-    adc output   ; 3 cyc
-    sta output   ; 3 cyc
-    lda #0       ; 2 cyc
-    adc output+1 ; 3 cyc
-    sta output+1 ; 3 cyc
+            ; add the second param one last time for the skipped bit
+            clc          ; 2 cyc
+            lda arg2     ; 3 cyc
+            adc output   ; 3 cyc
+            sta output   ; 3 cyc
+            lda #0       ; 2 cyc
+            adc output+1 ; 3 cyc
+            sta output+1 ; 3 cyc
 
-done:
+        done:
+        .endscope
+    .else
+        ; Using base 48k RAM compatibility mode
+        ; Small table of half squares
+        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
+        ; 81-92 cycles; clobbers a, x; writes dest and dest + 1 only
+        .scope
+            mul_factor_a   = arg1
+            mul_factor_x   = arg2
+            mul_product_lo = dest
+            mul_product_hi = dest + 1
+
+            lda mul_factor_a      ; 3 cyc
+
+            ; (a + x)^2/2
+            clc                   ; 2 cyc         
+            adc mul_factor_x      ; 3 cyc
+            tax                   ; 2 cyc
+            bcc under256          ; 2 cyc
+            lda mul_hibyte512,x   ; 4 cyc
+            bcs next              ; 2 cyc
+        under256:
+            lda mul_hibyte256,x   ; 4 cyc
+            sec                   ; 2 cyc
+        next:
+            sta mul_product_hi    ; 3 cyc
+            lda mul_lobyte256,x   ; 4 cyc
+
+            ; - a^2/2
+            ldx mul_factor_a      ; 3 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+
+            ; + x & a & 1:
+            ; (this is a kludge to correct a
+            ; roundoff error that makes odd * odd too low)
+            ldx mul_factor_x      ; 3 cyc
+            txa                   ; 2 cyc
+            and mul_factor_a      ; 3 cyc
+            and #1                ; 2 cyc
+
+            clc                   ; 2 cyc
+            adc mul_product_lo    ; 3 cyc
+            bcc small_product     ; 2 cyc
+            inc mul_product_hi    ; 5 cyc
+
+            ; - x^2/2
+        small_product:
+            sec                   ; 2 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+        .endscope
+    .endif
 .endmacro
 
 .proc imul8xe_init
@@ -532,6 +547,14 @@ init:
     lda #.hibyte(imul16xe_func)
     sta imul16_func + 2
 
+    ; ditto for sqr16_func -> sqr16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+
     ; create the lookup table
     ; go through the input set, in four 16KB chunks
 
@@ -632,7 +655,13 @@ inner_loop:
 
 .endproc
 
-.proc imul16_func
+.macro imul16_impl xe
+    .local arg1
+    .local arg2
+    .local result
+    .local inter
+    .local arg1_pos
+    .local arg2_pos
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
@@ -643,20 +672,20 @@ inner_loop:
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
-    imul8 result, arg1, arg2
+    imul8 result, arg1, arg2, xe
     lda #0
     sta result + 2
     sta result + 3
 
-    imul8 inter, arg1 + 1, arg2
+    imul8 inter, arg1 + 1, arg2, xe
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1, arg2 + 1
+    imul8 inter, arg1, arg2 + 1, xe
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1 + 1, arg2 + 1
+    imul8 inter, arg1 + 1, arg2 + 1, xe
     add16 result + 2, result + 2, inter
 
     ; In case of negative inputs, adjust high word
@@ -671,47 +700,59 @@ arg1_pos:
 arg2_pos:
 
     rts ; 6 cyc
+.endmacro
+
+.macro sqr16_impl xe
+    .scope
+        arg = FR0    ; 16-bit signed arg (clobbered: negated in place when negative)
+        result = FR2 ; 32-bit result
+        ; FR1 is not an operand here, so it serves as the intermediate product buffer
+        inter = FR1
+
+        lda arg + 1  ; a square does not depend on the sign, so work on the
+        bpl arg_pos  ; magnitude and skip any sign fix-up of the high word
+        neg16 arg
+    arg_pos:
+
+        ; hl * hl
+        ; (h*256 + l) * (h*256 + l)
+        ; h*256*(h*256 + l) + l*(h*256 + l)
+        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
+
+        sqr8 result, arg ; result[0..1] = l*l
+        ; (table lookup takes the place of an imul8 of arg by itself)
+        lda #0
+        sta result + 2
+        sta result + 3
+
+        imul8 inter, arg + 1, arg, xe ; inter = h*l
+        add16 result + 1, result + 1, inter ; + h*l*256
+        add_carry result + 3
+        add16 result + 1, result + 1, inter ; + h*l*256 again: the cross term appears twice
+        add_carry result + 3
+
+        sqr8 inter, arg + 1 ; inter = h*h
+        ; (table lookup takes the place of an imul8 of arg + 1 by itself)
+        add16 result + 2, result + 2, inter ; + h*h*256*256
+
+        rts ; 6 cyc
+    .endscope
+.endmacro
+
+.proc imul16_func
+    imul16_impl 0 ; xe = 0: imul8 takes its base-RAM table path; init code may overwrite the start of this routine with a jmp to imul16xe_func
+.endproc
 
 .proc imul16xe_func
-    arg1 = FR0   ; 16-bit arg (clobbered)
-    arg2 = FR1   ; 16-bit arg (clobbered)
-    result = FR2 ; 32-bit result
-    inter = temp2
+    imul16_impl 1 ; xe = 1: imul8 takes its 64KB lookup-table path
+.endproc
 
-    ; h1l1 * h2l2
-    ; (h1*256 + l1) * (h2*256 + l2)
-    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
-    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+.proc sqr16_func
+    sqr16_impl 0 ; xe = 0: imul8 takes its base-RAM table path; init code may overwrite the first 3 bytes with jmp sqr16xe_func
+.endproc
 
-    imul8xe result, arg1, arg2
-    lda #0
-    sta result + 2
-    sta result + 3
-
-    imul8xe inter, arg1 + 1, arg2
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8xe inter, arg1, arg2 + 1
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8xe inter, arg1 + 1, arg2 + 1
-    add16 result + 2, result + 2, inter
-
-    ; In case of negative inputs, adjust high word
-    ; https://stackoverflow.com/a/28827013
-    lda arg1 + 1
-    bpl arg1_pos
-    sub16 result + 2, result + 2, arg2
-arg1_pos:
-    lda arg2 + 1
-    bpl arg2_pos
-    sub16 result + 2, result + 2, arg1
-arg2_pos:
-
-    rts ; 6 cyc
+.proc sqr16xe_func
+    sqr16_impl 1 ; xe = 1: imul8 takes its 64KB lookup-table path
 .endproc
 
 .macro round16 arg
@@ -825,10 +866,10 @@ keep_going:
     quick_exit zy, 2
 
     ; zx_2 = zx * zx
-    imul16_round zx_2, zx, zx, 4
+    sqr16_round zx_2, zx, 4
 
     ; zy_2 = zy * zy
-    imul16_round zy_2, zy, zy, 4
+    sqr16_round zy_2, zy, 4
 
     ; zx_zy = zx * zy
     imul16_round zx_zy, zx, zy, 4
diff --git a/tables.js b/tables.js
index c772f81..50cbef9 100644
--- a/tables.js
+++ b/tables.js
@@ -22,7 +22,10 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
+.export sqr_lobyte
+.export sqr_hibyte
 
+; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 
+; (i * i) for the plain squares
+.align 256
+sqr_lobyte:
+${db((i) => (i * i) & 0xff)}
+
+.align 256
+sqr_hibyte:
+${db((i) => ((i * i) >> 8) & 0xff)}
+
 `);