diff --git a/Makefile b/Makefile
index bd14c7d..008bf8c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@
 
 all : mandel.xex
 
-mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+mandel.xex : mandel.o tables.o atari-asm-xex.cfg # relink when the linker config changes; the recipe links only the .o prerequisites
+	ld65 -C ./atari-asm-xex.cfg -o $@ $(filter %.o,$^)
 
 %.o : %.s
 	ca65 -o $@ $<
diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
index 93b80f3..6e6498d 100644
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@@ -6,10 +6,7 @@ SYMBOLS {
 }
 MEMORY {
     ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
-    # Keep $4000-7fff clear for expanded RAM access window
-    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
-    # Keep $a000-$bfff clear for BASIC cartridge
+    MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S; # NOTE(review): a single region up to $BC20 no longer keeps the $4000-$7FFF bank window or the $8000-$9FFF framebuffer clear - confirm the linked image ends below $4000
 }
 FILES {
     %O: format = atari;
@@ -24,5 +21,5 @@ SEGMENTS {
     RODATA:   load = MAIN,    type = ro   optional = yes;
     DATA:     load = MAIN,    type = rw   optional = yes;
     BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
+    TABLES:   load = MAIN,    type = ro,  optional = yes, align = 256;
 }
diff --git a/mandel.s b/mandel.s
index 9f594e8..3579b0f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -62,11 +62,11 @@ FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
 FMOVE  = $DDB6 ; MOVE FR0 TO FR1
 
 ; High data
-framebuffer_top    = $a000
-textbuffer         = $af00
-framebuffer_bottom = $b000
-display_list       = $bf00
-framebuffer_end    = $c000
+framebuffer_top    = $8000
+textbuffer         = $8f00
+framebuffer_bottom = $9000
+display_list       = $9f00
+framebuffer_end    = $a000
 
 height = 184
 half_height = height >> 1
@@ -107,14 +107,12 @@ KEY_RIGHT = $87
 
 .struct float48
     exponent .byte
-    mantissa .byte 5
+    mantissa .byte 5 ; 1 exponent byte + 5 mantissa bytes = 6 bytes = 48 bits, as the struct name says
 .endstruct
 
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
-.import sqr_lobyte
-.import sqr_hibyte
 
 .data
 
@@ -376,150 +374,137 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
-.macro sqr16_round dest, arg, shift
-    ;imul16_round dest, arg, arg, shift
-    copy16 FR0, arg   ; 12 cyc
-    jsr sqr16_func      ; ? cyc
-    shift_round_16 FR2, shift
-    copy16 dest, FR2 + 2  ; 12 cyc
-.endmacro
+; 8x8 -> 16-bit multiply via half-square tables. Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2
+    .local under256
+    .local next
+    .local small_product
+    ; Cycle estimate: 81-92 (an earlier note of circa 92 was marked as doubtful; unverified).
+    ; Writes the product to dest (low byte) and dest + 1 (high byte); clobbers a, x.
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
 
-; clobbers a, x
-.macro sqr8 dest, arg
-    ldx arg
-    lda sqr_lobyte,x
-    sta dest
-    lda sqr_hibyte,x
-    sta dest + 1
+        lda mul_factor_a      ; 3 cyc
+
+        ; start from (a + x)^2/2, looked up by the sum a + x
+        clc                   ; 2 cyc
+        adc mul_factor_x      ; 3 cyc
+        tax                   ; 2 cyc - x = low 8 bits of the sum, carry = bit 8
+        bcc under256          ; 2 cyc - sum fits in 8 bits
+        lda mul_hibyte512,x   ; 4 cyc - sum >= 256: high byte comes from the 512 table
+        bcs next              ; 2 cyc - carry is still set here, so this always branches
+    under256:
+        lda mul_hibyte256,x   ; 4 cyc
+        sec                   ; 2 cyc - both paths reach next with carry set for the sbc below
+    next:
+        sta mul_product_hi    ; 3 cyc
+        lda mul_lobyte256,x   ; 4 cyc - low byte lookup is shared by both paths
+
+        ; - a^2/2
+        ldx mul_factor_a      ; 3 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc - borrow from the low byte carries through
+        sta mul_product_hi    ; 3 cyc
+
+        ; + x & a & 1:
+        ; (this is a kludge to correct a
+        ; roundoff error that makes odd * odd too low)
+        ldx mul_factor_x      ; 3 cyc - x stays loaded for the final table lookups
+        txa                   ; 2 cyc
+        and mul_factor_a      ; 3 cyc
+        and #1                ; 2 cyc - a = 1 only when both factors are odd
+
+        clc                   ; 2 cyc
+        adc mul_product_lo    ; 3 cyc
+        bcc small_product     ; 2 cyc - no carry out of the low byte
+        inc mul_product_hi    ; 5 cyc - carry into the high byte
+
+        ; - x^2/2
+    small_product:
+        sec                   ; 2 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc
+        sta mul_product_hi    ; 3 cyc
+    .endscope
 .endmacro
 
 ; lookup table for top byte -> PORTB value for bank-switch
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
     .repeat 256, i
-        .byte ((i & $c0) >> 4) | $e3
+        .byte ((i & $c0) >> 4) | $e1 ; bits 7-6 of the index land in bits 3-2 of the PORTB value, on top of the $e1 base
     .endrepeat
 
 .macro bank_switch bank
-    lda #((bank << 2) | $e3)
+    lda #((bank << 2) | $e1) ; bank number goes in bits 2 and up, on the same $e1 base as bank_switch_table; clobbers a
     sta PORTB
 .endmacro
 
-.macro imul8 dest, arg1, arg2, xe
-    .if xe
-        ; using 64KB lookup table
-        ; 58-77 cycles
-        ; clobbers x, y, dest to dest + 3
-        .scope
-            output = dest
-            ptr = dest + 2 ; scratch space assumed
 
-            ; bottom 14 bits except the LSB are the per-bank table index
-            ; add $4000 for the bank pointer
-            lda arg1     ; 3 cyc
-            and #$fe     ; 2 cyc
-            sta ptr      ; 3 cyc
-            lda arg2     ; 3 cyc
-            and #$3f     ; 2 cyc
-            clc          ; 2 cyc
-            adc #$40     ; 2 cyc
-            sta ptr + 1  ; 3 cyc
-            
-            ; top 2 bits are the table bank selector
-            ldx arg2                ; 3 cyc
-            lda bank_switch_table,x ; 4 cyc
-            sta PORTB               ; 4 cyc
+; imul8xe dest, arg1, arg2: 16-bit dest = arg1 * arg2, read from the bank-switched lookup table; 58-77 cycles
+; clobbers a, x, y, dest to dest + 3; PORTB is left selecting the table bank (not restored)
+.macro imul8xe dest, arg1, arg2
+.local done
+.local output
+.local ptr
+
+    output = dest
+    ptr = dest + 2 ; scratch space assumed: dest + 2..3 hold the pointer used by (ptr),y below
+
+    ; ptr = $4000 + ((arg2 & $3f) << 8) + (arg1 & $fe):
+    ; the low 14 bits of arg2:arg1, minus the LSB, index the table in the $4000 bank window
+    lda arg1     ; 3 cyc
+    and #$fe     ; 2 cyc
+    sta ptr      ; 3 cyc
+    lda arg2     ; 3 cyc
+    and #$3f     ; 2 cyc
+    clc          ; 2 cyc
+    adc #$40     ; 2 cyc
+    sta ptr + 1  ; 3 cyc
+    
+    ; top 2 bits of arg2 select the table bank
+    ldx arg2                ; 3 cyc
+    lda bank_switch_table,x ; 4 cyc
+    sta PORTB               ; 4 cyc
 
 
-            ; copy the entry into output
-            ldy #0       ; 2 cyc
-            lda (ptr),y  ; 5 cyc
-            sta output   ; 3 cyc
-            iny          ; 2 cyc
-            lda (ptr),y  ; 5 cyc
-            sta output+1 ; 3 cyc
+    ; copy the 2-byte table entry into output
+    ldy #0       ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output   ; 3 cyc
+    iny          ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output+1 ; 3 cyc
 
-            ; note: we are not restoring memory to save 6 cycles!
-            ; this means those 16kb have to be switched back to base RAM
-            ; if we need to use them anywhere else
-            ;;; restore memory
-            ;;lda #$81     ; 2 cyc - disabled
-            ;;sta PORTB    ; 4 cyc - disabled
+    ; note: we are not restoring memory to save 6 cycles!
+    ; this means those 16kb have to be switched back to base RAM
+    ; if we need to use them anywhere else
+    ;;; restore memory
+    ;;lda #$81     ; 2 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
 
-            ; check that 1 bit we skipped to fit into space
-            lda arg1     ; 3 cyc
-            and #1       ; 2 cyc
-            beq done     ; 2 cyc
+    ; the lookup ignored bit 0 of arg1 (skipped to fit into space); handle it here
+    lda arg1     ; 3 cyc
+    and #1       ; 2 cyc
+    beq done     ; 2 cyc - bit 0 clear: the table entry is already the product
 
-            ; add the second param one last time for the skipped bit
-            clc          ; 2 cyc
-            lda arg2     ; 3 cyc
-            adc output   ; 3 cyc
-            sta output   ; 3 cyc
-            lda #0       ; 2 cyc
-            adc output+1 ; 3 cyc
-            sta output+1 ; 3 cyc
+    ; bit 0 set: add the second param one last time for the skipped bit
+    clc          ; 2 cyc
+    lda arg2     ; 3 cyc
+    adc output   ; 3 cyc
+    sta output   ; 3 cyc
+    lda #0       ; 2 cyc
+    adc output+1 ; 3 cyc - carry from the low byte
+    sta output+1 ; 3 cyc
 
-        done:
-        .endscope
-    .else
-        ; Using base 48k RAM compatibility mode
-        ; Small table of half squares
-        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
-        ; 81-92 cycles
-        .scope
-            mul_factor_a   = arg1
-            mul_factor_x   = arg2
-            mul_product_lo = dest
-            mul_product_hi = dest + 1
-
-            lda mul_factor_a      ; 3 cyc
-
-            ; (a + x)^2/2
-            clc                   ; 2 cyc         
-            adc mul_factor_x      ; 3 cyc
-            tax                   ; 2 cyc
-            bcc under256          ; 2 cyc
-            lda mul_hibyte512,x   ; 4 cyc
-            bcs next              ; 2 cyc
-        under256:
-            lda mul_hibyte256,x   ; 4 cyc
-            sec                   ; 2 cyc
-        next:
-            sta mul_product_hi    ; 3 cyc
-            lda mul_lobyte256,x   ; 4 cyc
-
-            ; - a^2/2
-            ldx mul_factor_a      ; 3 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
-            sta mul_product_lo    ; 3 cyc
-            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
-            sta mul_product_hi    ; 3 cyc
-
-            ; + x & a & 1:
-            ; (this is a kludge to correct a
-            ; roundoff error that makes odd * odd too low)
-            ldx mul_factor_x      ; 3 cyc
-            txa                   ; 2 cyc
-            and mul_factor_a      ; 3 cyc
-            and #1                ; 2 cyc
-
-            clc                   ; 2 cyc
-            adc mul_product_lo    ; 3 cyc
-            bcc small_product     ; 2 cyc
-            inc mul_product_hi    ; 5 cyc
-
-            ; - x^2/2
-        small_product:
-            sec                   ; 2 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
-            sta mul_product_lo    ; 3 cyc
-            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
-            sta mul_product_hi    ; 3 cyc
-        .endscope
-    .endif
+done:
 .endmacro
 
 .proc imul8xe_init
@@ -547,14 +532,6 @@ init:
     lda #.hibyte(imul16xe_func)
     sta imul16_func + 2
 
-    ; ditto for sqr16_func -> sqr16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta sqr16_func
-    lda #.lobyte(sqr16xe_func)
-    sta sqr16_func + 1
-    lda #.hibyte(sqr16xe_func)
-    sta sqr16_func + 2
-
     ; create the lookup table
     ; go through the input set, in four 16KB chunks
 
@@ -655,13 +632,7 @@ inner_loop:
 
 .endproc
 
-.macro imul16_impl xe
-    .local arg1
-    .local arg2
-    .local result
-    .local inter
-    .local arg1_pos
-    .local arg2_pos
+.proc imul16_func
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
@@ -672,20 +643,20 @@ inner_loop:
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
-    imul8 result, arg1, arg2, xe
+    imul8 result, arg1, arg2
     lda #0
     sta result + 2
     sta result + 3
 
-    imul8 inter, arg1 + 1, arg2, xe
+    imul8 inter, arg1 + 1, arg2
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1, arg2 + 1, xe
+    imul8 inter, arg1, arg2 + 1
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1 + 1, arg2 + 1, xe
+    imul8 inter, arg1 + 1, arg2 + 1
     add16 result + 2, result + 2, inter
 
     ; In case of negative inputs, adjust high word
@@ -700,59 +671,47 @@ arg1_pos:
 arg2_pos:
 
     rts ; 6 cyc
-.endmacro
-
-.macro sqr16_impl xe
-    .scope
-        arg = FR0    ; 16-bit arg (clobbered)
-        result = FR2 ; 32-bit result
-        ;inter = temp2
-        inter = FR1
-
-        lda arg + 1
-        bpl arg_pos
-        neg16 arg
-    arg_pos:
-
-        ; hl * hl
-        ; (h*256 + l) * (h*256 + l)
-        ; h*256*(h*256 + l) + l*(h*256 + l)
-        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
-
-        sqr8 result, arg
-        ;imul8 inter, arg, arg, xe
-        lda #0
-        sta result + 2
-        sta result + 3
-
-        imul8 inter, arg + 1, arg, xe
-        add16 result + 1, result + 1, inter
-        add_carry result + 3
-        add16 result + 1, result + 1, inter
-        add_carry result + 3
-
-        sqr8 inter, arg + 1
-        ;imul8 inter, arg + 1, arg + 1, xe
-        add16 result + 2, result + 2, inter
-
-        rts ; 6 cyc
-    .endscope
-.endmacro
-
-.proc imul16_func
-    imul16_impl 0
 .endproc
 
 .proc imul16xe_func
-    imul16_impl 1
-.endproc
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2 ; scratch for partial products; NOTE(review): imul8xe also writes inter + 2..3, so temp2 must provide 4 bytes - confirm
 
-.proc sqr16_func
-    sqr16_impl 0
-.endproc
+    ; 16x16 -> 32-bit multiply built from four 8x8 table lookups:
+    ; h1l1 * h2l2 = (h1*256 + l1) * (h2*256 + l2)
+    ; = h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; = h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
-.proc sqr16xe_func
-    sqr16_impl 1
+    imul8xe result, arg1, arg2 ; l1*l2 into result bytes 0-1
+    lda #0
+    sta result + 2 ; clear the upper word (imul8xe used it as scratch)
+    sta result + 3
+
+    imul8xe inter, arg1 + 1, arg2 ; h1*l2
+    add16 result + 1, result + 1, inter ; added at byte offset 1, i.e. times 256
+    add_carry result + 3
+
+    imul8xe inter, arg1, arg2 + 1 ; l1*h2
+    add16 result + 1, result + 1, inter ; added at byte offset 1, i.e. times 256
+    add_carry result + 3
+
+    imul8xe inter, arg1 + 1, arg2 + 1 ; h1*h2
+    add16 result + 2, result + 2, inter ; added at byte offset 2, i.e. times 256*256
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1
+    bpl arg1_pos ; arg1 non-negative: no correction
+    sub16 result + 2, result + 2, arg2 ; arg1 negative: subtract arg2 from the high word
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos ; arg2 non-negative: no correction
+    sub16 result + 2, result + 2, arg1 ; arg2 negative: subtract arg1 from the high word
+arg2_pos:
+
+    rts ; 6 cyc
 .endproc
 
 .macro round16 arg
@@ -866,10 +825,10 @@ keep_going:
     quick_exit zy, 2
 
     ; zx_2 = zx * zx
-    sqr16_round zx_2, zx, 4
+    imul16_round zx_2, zx, zx, 4
 
     ; zy_2 = zy * zy
-    sqr16_round zy_2, zy, 4
+    imul16_round zy_2, zy, zy, 4
 
     ; zx_zy = zx * zy
     imul16_round zx_zy, zx, zy, 4
diff --git a/tables.js b/tables.js
index 50cbef9..c772f81 100644
--- a/tables.js
+++ b/tables.js
@@ -22,10 +22,7 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
-.export sqr_lobyte
-.export sqr_hibyte
 
-; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@@ -38,13 +35,4 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 
-; (i * i) for the plain squares
-.align 256
-sqr_lobyte:
-${db((i) => (i * i) & 0xff)}
-
-.align 256
-sqr_hibyte:
-${db((i) => ((i * i) >> 8) & 0xff)}
-
 `);