diff --git a/Makefile b/Makefile
index bd14c7d..008bf8c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@
 
 all : mandel.xex
 
-mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 
 %.o : %.s
 	ca65 -o $@ $<
diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
deleted file mode 100644
index 93b80f3..0000000
--- a/atari-asm-xex.cfg
+++ /dev/null
@@ -1,28 +0,0 @@
-FEATURES {
-    STARTADDRESS: default = $2E00;
-}
-SYMBOLS {
-    __STARTADDRESS__: type = export, value = %S;
-}
-MEMORY {
-    ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
-    # Keep $4000-7fff clear for expanded RAM access window
-    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
-    # Keep $a000-$bfff clear for BASIC cartridge
-}
-FILES {
-    %O: format = atari;
-}
-FORMATS {
-    atari: runad = start;
-}
-SEGMENTS {
-    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
-    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
-    CODE:     load = MAIN,    type = rw,                  define = yes;
-    RODATA:   load = MAIN,    type = ro   optional = yes;
-    DATA:     load = MAIN,    type = rw   optional = yes;
-    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
-}
diff --git a/mandel.s b/mandel.s
index b8985b3..fcc7867 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,46 +1,39 @@
 ; Our zero-page vars
-ox              = $80 ; fixed8.24: center point x
-oy              = $84 ; fixed8.24: center point y
-cx              = $88 ; fixed8.24: c_x
-cy              = $8c ; fixed8.24: c_y
+sx    = $80     ; i16: screen pixel x
+sy    = $82     ; i16: screen pixel y
+ox    = $84     ; fixed4.12: center point x
+oy    = $86     ; fixed4.12: center point y
+cx    = $88     ; fixed4.12: c_x
+cy    = $8a     ; fixed4.12: c_y
+zx    = $8c     ; fixed4.12: z_x
+zy    = $8e     ; fixed4.12: z_y
 
-zx              = $90 ; fixed8.24: z_x
-zy              = $94 ; fixed8.24: z_y
-zx_2            = $98 ; fixed8.24: z_x^2
-zy_2            = $9c ; fixed8.24: z_y^2
+zx_2  = $90     ; fixed4.12: z_x^2
+zy_2  = $92     ; fixed4.12: z_y^2
+zx_zy = $94     ; fixed4.12: z_x * z_y
+dist  = $96     ; fixed4.12: z_x^2 + z_y^2
 
-zx_zy           = $a0 ; fixed8.24: z_x * z_y
-dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
-sx              = $a8 ; i16: screen pixel x
-sy              = $aa ; i16: screen pixel y
-z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
-z_buffer_start  = $ad ; u8: index into z_buffer
-z_buffer_end    = $ae ; u8: index into z_buffer
-iter            = $af ; u8: iteration count
+iter         = $a0 ; u8: iteration count
 
-ptr             = $b0 ; u16
-pixel_ptr       = $b2 ; u16
-zoom            = $b4 ; u8: zoom shift level
-fill_level      = $b5 ; u8
-pixel_color     = $b6 ; u8
-pixel_mask      = $b7 ; u8
-pixel_shift     = $b8 ; u8
-pixel_offset    = $b9 ; u8
-palette_offset  = $ba ; u8
-chroma_offset   = $bb ; u8
-palette_ticks   = $bc ; u8
-chroma_ticks    = $bd ; u8
-count_frames    = $be ; u8
-count_pixels    = $bf ; u8
+zoom         = $a1 ; u8: zoom shift level
+count_frames = $a2 ; u8
+count_pixels = $a3 ; u8
+total_ms     = $a4 ; float48
+total_pixels = $aa ; float48
 
-total_pixels    = $c0 ; float48
-total_ms        = $c6 ; float48
-temp            = $cc ; u16
-temp2           = $ce ; u16
-
-palette_delay = 23
-chroma_delay = 137
+z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
+z_buffer_start  = $b1 ; u8: index into z_buffer
+z_buffer_end    = $b2 ; u8: index into z_buffer
+temp            = $b4 ; u16
+temp2           = $b6 ; u16
 
+pixel_ptr       = $b8 ; u16
+pixel_color     = $ba ; u8
+pixel_mask      = $bb ; u8
+pixel_shift     = $bc ; u8
+pixel_offset    = $bd ; u8
+fill_level      = $be ; u8
+palette_offset  = $bf ; u8
 
 ; FP registers in zero page
 FR0    = $d4 ; float48
@@ -70,11 +63,11 @@ FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
 FMOVE  = $DDB6 ; MOVE FR0 TO FR1
 
 ; High data
-framebuffer_top    = $a000
-textbuffer         = $af00
-framebuffer_bottom = $b000
-display_list       = $bf00
-framebuffer_end    = $c000
+framebuffer_top    = $8000
+textbuffer         = $8f00
+framebuffer_bottom = $9000
+display_list       = $9f00
+framebuffer_end    = $a000
 
 height = 184
 half_height = height >> 1
@@ -82,9 +75,6 @@ width = 160
 half_width = width >> 1
 stride = width >> 2
 
-EXTENDED_RAM = $4000 ; 16KiB bank on the XE
-PORTB  = $D301 ; memory & bank-switch for XL/XE
-
 DMACTL = $D400
 DLISTL = $D402
 DLISTH = $D403
@@ -112,27 +102,15 @@ KEY_UP    = $8e
 KEY_DOWN  = $8f
 KEY_LEFT  = $86
 KEY_RIGHT = $87
-KEY_1     = $1f
-KEY_2     = $1e
-KEY_3     = $1a
-KEY_4     = 24
-KEY_5     = 29
-KEY_6     = 27
-KEY_7     = 51
-KEY_8     = 53
-KEY_9     = 48
-KEY_0     = 50
 
 .struct float48
     exponent .byte
-    mantissa .byte 5
+    mantissa .byte 5 ; 1 exponent + 5 mantissa bytes = 6 bytes (48 bits)
 .endstruct
 
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
-.import sqr_lobyte
-.import sqr_hibyte
 
 .data
 
@@ -239,32 +217,14 @@ color_map:
         .byte 3
     .endrepeat
 
-
-palette_start:
-    .byte $0e
-    .byte $08
-    .byte $04
-palette_repeat:
-    .byte $0e
-    .byte $08
-
-palette_entries = 3
-
-palette_chroma:
-    .repeat 15, i
-        .byte (i + 1) << 4
-    .endrepeat
-    .repeat 2, i
-        .byte (i + 1) << 4
-    .endrepeat
-palette_chroma_entries = 15
-
+palette:
+    .byte $00
+    .byte $46
+    .byte $78
+    .byte $b4
 .code
 
-;z_buffer_len = 16 ; 10.863 ms/px
-;z_buffer_len = 12 ; 10.619 ms/px
-z_buffer_len = 8 ; 10.612 ms/px
-;z_buffer_len = 4 ; 12.395 ms/px
+z_buffer_len = 16
 z_buffer_mask = z_buffer_len - 1
 z_buffer:
     ; the last N zx/zy values
@@ -275,34 +235,15 @@ z_buffer:
 
 .export start
 
-;max_fill_level = 6
-max_fill_level = 3
+max_fill_level = 6
 fill_masks:
-;    .byte %00011111
-;    .byte %00001111
-;    .byte %00000111
+    .byte %00011111
+    .byte %00001111
+    .byte %00000111
     .byte %00000011
     .byte %00000001
     .byte %00000000
 
-viewport_zoom:
-    .byte 1
-    .byte 6
-    .byte 8
-    .byte 6
-
-viewport_ox:
-    .dword $00000000
-    .dword $ff110000
-    .dword $ff110000
-    .dword $fe400000
-
-viewport_oy:
-    .dword $00000000
-    .dword $ffb60000
-    .dword $ffbe0000
-    .dword $00000000
-
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
     clc ; 2 cyc
@@ -313,21 +254,18 @@ viewport_oy:
     .endrepeat
 .endmacro
 
-; 20 cycles
 .macro add16 dest, arg1, arg2
     add 2, dest, arg1, arg2
 .endmacro
 
-; 38 cycles
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg1, arg2
+    add 4, dest, arg1, arg2 ; dest = arg1 + arg2 over 4 bytes, same contract as add16
 .endmacro
 
-; 8 cycles
 .macro add_carry dest
-    lda dest ; 3 cyc
-    adc #0   ; 2 cyc
-    sta dest ; 3 cyc
+    lda dest
+    adc #0
+    sta dest
 .endmacro
 
 ; 2 + 9 * byte cycles
@@ -340,35 +278,29 @@ viewport_oy:
     .endrepeat
 .endmacro
 
-; 20 cycles
 .macro sub16 dest, arg1, arg2
     sub 2, dest, arg1, arg2
 .endmacro
 
-; 38 cycles
 .macro sub32 dest, arg1, arg2
     sub 4, dest, arg1, arg2
 .endmacro
 
-; 3 + 5 * bytes cycles
 .macro shl bytes, arg
-    asl arg              ; 3 cyc
+    asl arg
     .repeat bytes-1, i
-        rol arg + 1 + i  ; 5 cyc
+        rol arg + 1 + i
     .endrepeat
 .endmacro
 
-; 13 cycles
 .macro shl16 arg
     shl 2, arg
 .endmacro
 
-; 18 cycles
 .macro shl24 arg
     shl 3, arg
 .endmacro
 
-; 23 cycles
 .macro shl32 arg
     shl 4, arg
 .endmacro
@@ -381,17 +313,14 @@ viewport_oy:
     .endrepeat
 .endmacro
 
-; 12 cycles
 .macro copy16 dest, arg
     copy 2, dest, arg
 .endmacro
 
-; 24 cycles
 .macro copy32 dest, arg
     copy 4, dest, arg
 .endmacro
 
-; 36 cycles
 .macro copyfloat dest, arg
     copy 6, dest, arg
 .endmacro
@@ -416,419 +345,223 @@ viewport_oy:
     neg 4, arg
 .endmacro
 
-; 11-27 + 23 * shift cycles
-; 103-119 cycles for shift=4
-.macro shift_round_16 arg, shift
-    .repeat shift
-        shl32 arg ; 23 cycles
-    .endrepeat
-    round16 arg ; 11-27 cycles
+; inner loop for imul16
+; bitnum < 8: 25 or 41 cycles
+; bitnum >= 8: 30 or 46 cycles
+.macro bitmul16 arg1, arg2, result, bitnum
+    .local zero
+    .local one
+    .local next
+
+    ; does 16-bit adds
+    ; arg1 and arg2 are treated as unsigned
+    ; negative signed inputs must be flipped first
+
+    ; 7 cycles up to the branch
+
+    ; check if arg1 has 0 or 1 bit in this place
+    ; 5 cycles either way
+    .if bitnum < 8
+        lda arg1                 ; 3 cyc
+        and #(1 << (bitnum))       ; 2 cyc
+    .else
+        lda arg1 + 1             ; 3 cyc
+        and #(1 << ((bitnum) - 8)) ; 2 cyc
+    .endif
+    bne one ; 2 cyc
+
+zero: ; 18 cyc, 23 cyc
+    lsr result + 3 ; 5 cyc
+    jmp next       ; 3 cyc
+
+one: ; 32 cyc, 37 cyc
+    ; 16-bit add on the top bits
+    clc            ; 2 cyc
+    lda result + 2 ; 3 cyc
+    adc arg2       ; 3 cyc
+    sta result + 2 ; 3 cyc
+    lda result + 3 ; 3 cyc
+    adc arg2 + 1   ; 3 cyc
+    ror a          ; 2 cyc - get a jump on the shift
+    sta result + 3 ; 3 cyc
+next:
+    ror result + 2 ; 5 cyc
+    ror result + 1 ; 5 cyc
+    .if bitnum >= 8
+        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
+        ; when it's all uninitialized data
+        ror result ; 5 cyc
+    .endif
+
 .endmacro
 
-; input: arg1, arg2 as fixed4.12
-; output: dest as fixed8.24
+; 5 to 25 cycles
+.macro check_sign arg
+    ; Check sign bit and flip argument to positive,
+    ; keeping a count of sign bits in the Y register.
+    .local positive
+    lda arg + 1   ; 3 cyc - high byte carries the sign bit
+    bpl positive  ; 2 cyc - already non-negative: leave arg and Y alone
+    neg16 arg     ; 18 cyc - negate the 16-bit value in place
+    iny           ; 2 cyc - count one negative input
+positive:
+.endmacro
+
+; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; ? cyc
+    jsr imul16_func   ; 470-780 cyc
     copy32 dest, FR2  ; 24 cyc
 .endmacro
 
-; input: arg as fixed4.12
-; output: dest as fixed8.24
-.macro sqr16 dest, arg
-    copy16 FR0, arg   ; 12 cyc
-    jsr sqr16_func    ; ? cyc
-    copy32 dest, FR2  ; 24 cyc
+.macro shift_round_16 arg, shift
+    .repeat shift
+        shl32 arg
+    .endrepeat
+    round16 arg
 .endmacro
 
-; input: arg as u8
-; output: dest as u16
-; clobbers a, x
-.macro sqr8 dest, arg
-    ldx arg
-    lda sqr_lobyte,x
-    sta dest
-    lda sqr_hibyte,x
-    sta dest + 1
+.macro imul16_round dest, arg1, arg2, shift
+    copy16 FR0, arg1  ; 12 cyc - first 16-bit operand goes in FR0
+    copy16 FR1, arg2  ; 12 cyc - second 16-bit operand goes in FR1
+    jsr imul16_func   ; 470-780 cyc - 32-bit product is left in FR2
+    shift_round_16 FR2, shift ; shift product left 'shift' bits, then round its top 16 bits in place
+    copy16 dest, FR2 + 2  ; 12 cyc - keep only the rounded high word
+.endmacro
 
-; input: arg as u8
-; input/output: dest as u16
-; clobbers a, x
-.macro sqr8_add16 dest, arg
-    ldx arg
-    clc
-    lda sqr_lobyte,x
-    adc dest
-    sta dest
-    lda sqr_hibyte,x
-    adc dest + 1
-    sta dest + 1
-.endmacro
+; min 470 cycles
+; max 780 cycles
+.proc imul16_func_orig
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
 
-.segment "TABLES"
-; lookup table for top byte -> PORTB value for bank-switch
-.align 256
-bank_switch_table:
-    .repeat 256, i
-        .byte ((i & $c0) >> 4) | $e3
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
+    check_sign arg1 ; 5 to 25 cyc
+    check_sign arg2 ; 5 to 25 cyc
+    
+    ; zero out the 32-bit temp's top 16 bits
+    lda #0          ; 2 cyc
+    sta result + 2  ; 3 cyc
+    sta result + 3  ; 3 cyc
+    ; the bottom two bytes will get cleared by the shifts
+
+    ; unrolled loop for maximum speed, at the cost
+    ; of a larger routine
+    ; 440 to 696 cycles
+    .repeat 16, bitnum
+        ; bitnum < 8: 25 or 41 cycles
+        ; bitnum >= 8: 30 or 46 cycles
+        bitmul16 arg1, arg2, result, bitnum
     .endrepeat
 
-.code
+    ; In case of mixed input signs, return a negative result.
+    cpy #1              ; 2 cyc
+    bne positive_result ; 2 cyc
+    neg32 result        ; 34 cyc
+positive_result:
 
-.macro bank_switch bank
-    lda #((bank << 2) | $e3)
-    sta PORTB
-.endmacro
-
-.macro imul8 dest, arg1, arg2, xe
-    .if xe
-        ; using 64KB lookup table
-        ; 51-70 cycles
-        ; clobbers x, y, dest, ptr
-        .scope
-            output = dest
-
-            ; top 2 bits are the table bank selector
-            ldx arg2                ; 3 cyc
-            lda bank_switch_table,x ; 4 cyc
-            sta PORTB               ; 4 cyc
-
-            ; bottom 14 bits except the LSB are the per-bank table index
-            ; add $4000 for the bank pointer
-            txa          ; 2 cyc
-            and #$3f     ; 2 cyc
-            ora #$40     ; 2 cyc
-            sta ptr + 1  ; 3 cyc
-
-            ; copy the entry into output
-            lda arg1     ; 3 cyc
-            and #$fe     ; 2 cyc
-            tay          ; 2 cyc
-            lda (ptr),y  ; 5 cyc
-            sta output   ; 3 cyc
-            iny          ; 2 cyc
-            lda (ptr),y  ; 5 cyc
-            sta output+1 ; 3 cyc
-
-            ; note: we are not restoring memory to save 6 cycles!
-            ; this means those 16kb have to be switched back to base RAM
-            ; if we need to use them anywhere else
-            ;;; restore memory
-            ;;lda #$81     ; 2 cyc - disabled
-            ;;sta PORTB    ; 4 cyc - disabled
-
-            ; check that 1 bit we skipped to fit into space
-            lda arg1     ; 3 cyc
-            and #1       ; 2 cyc
-            beq done     ; 2 cyc
-
-            ; add arg2 one last time for the skipped bit
-            clc          ; 2 cyc
-            txa          ; 2 cyc
-            adc output   ; 3 cyc
-            sta output   ; 3 cyc
-            lda #0       ; 2 cyc
-            adc output+1 ; 3 cyc
-            sta output+1 ; 3 cyc
-
-        done:
-        .endscope
-    .else
-        ; Using base 48k RAM compatibility mode
-        ; Small table of half squares
-        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
-        ; 81-92 cycles
-        .scope
-            mul_factor_a   = arg1
-            mul_factor_x   = arg2
-            mul_product_lo = dest
-            mul_product_hi = dest + 1
-
-            lda mul_factor_a      ; 3 cyc
-
-            ; (a + x)^2/2
-            clc                   ; 2 cyc         
-            adc mul_factor_x      ; 3 cyc
-            tax                   ; 2 cyc
-            bcc under256          ; 2 cyc
-            lda mul_hibyte512,x   ; 4 cyc
-            bcs next              ; 2 cyc
-        under256:
-            lda mul_hibyte256,x   ; 4 cyc
-            sec                   ; 2 cyc
-        next:
-            sta mul_product_hi    ; 3 cyc
-            lda mul_lobyte256,x   ; 4 cyc
-
-            ; - a^2/2
-            ldx mul_factor_a      ; 3 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
-            sta mul_product_lo    ; 3 cyc
-            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
-            sta mul_product_hi    ; 3 cyc
-
-            ; + x & a & 1:
-            ; (this is a kludge to correct a
-            ; roundoff error that makes odd * odd too low)
-            ldx mul_factor_x      ; 3 cyc
-            txa                   ; 2 cyc
-            and mul_factor_a      ; 3 cyc
-            and #1                ; 2 cyc
-
-            clc                   ; 2 cyc
-            adc mul_product_lo    ; 3 cyc
-            bcc small_product     ; 2 cyc
-            inc mul_product_hi    ; 5 cyc
-
-            ; - x^2/2
-        small_product:
-            sec                   ; 2 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
-            sta mul_product_lo    ; 3 cyc
-            lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
-            sta mul_product_hi    ; 3 cyc
-        .endscope
-    .endif
-.endmacro
-
-.proc imul8xe_init
-
-    bank_switch 0
-    lda #0
-    sta EXTENDED_RAM
-    bank_switch 1
-    lda #1
-    sta EXTENDED_RAM
-    bank_switch 0
-    lda EXTENDED_RAM
-    beq init
-
-    ; no bank switching available, we just overwrite the value in base ram
-    rts
-
-init:
-
-    ; patch imul16_func into a forwarding thunk to imul16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta imul16_func
-    lda #.lobyte(imul16xe_func)
-    sta imul16_func + 1
-    lda #.hibyte(imul16xe_func)
-    sta imul16_func + 2
-
-    ; ditto for sqr16_func -> sqr16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta sqr16_func
-    lda #.lobyte(sqr16xe_func)
-    sta sqr16_func + 1
-    lda #.hibyte(sqr16xe_func)
-    sta sqr16_func + 2
-
-    ; create the lookup table
-    ; go through the input set, in four 16KB chunks
-
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-
-    lda #$00
-    sta arg1
-    sta arg2
-    sta ptr
-    lda #$40
-    sta ptr + 1
-
-    ; $00 * $00 -> $3f * $ff
-    bank_switch 0
-    jsr imul8xe_init_section
-
-    ; $40 * $00 -> $7f * $ff
-    bank_switch 1
-    jsr imul8xe_init_section
-
-    ; $80 * $00 -> $bf * $ff
-    bank_switch 2
-    jsr imul8xe_init_section
-
-    ; $c0 * $00 -> $ff * $ff
-    bank_switch 3
-    jsr imul8xe_init_section
-
-    rts
+    rts ; 6 cyc
 .endproc
 
-; Initialize a 16 KB chunk of the table
-; input: multipliers in temp
-; output: new multipliers in temp
-; clobbers: temp, temp2
-.proc imul8xe_init_section
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-    ptr = temp2
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2
+    .local under256
+    .local next
+    .local small_product
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
 
-    lda #$00
-    sta ptr
-    lda #$40
-    sta ptr + 1
+        lda mul_factor_a      ; setup: A = first factor (clobbers A and X)
 
-    ldy #0
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x
+        tax
+        bcc under256
+        lda mul_hibyte512,x
+        bcs next
+    under256:
+        lda mul_hibyte256,x
+        sec
+    next:
+        sta mul_product_hi
+        lda mul_lobyte256,x
 
-    ; outer loop: $00 -> $3f
-outer_loop:
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
 
-    ; reset result to 0
-    lda #0
-    sta result
-    sta result + 1
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1
 
-    ; inner loop: $00 -> $ff
-inner_loop:
+        clc
+        adc mul_product_lo
+        bcc small_product
+        inc mul_product_hi
+    small_product:
+        sec                   ; - x^2/2: 25 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo    ; store the final low byte before A is reused for the high byte
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+    .endscope
+.endmacro
 
-    ; copy result to data set
-    lda result
-    sta (ptr),y
-    lda result + 1
-    iny
-    sta (ptr),y
-    dey
-
-    ; result += 2 * arg2
-    clc
-    lda arg2
-    adc result
-    sta result
-    lda #0
-    adc result + 1
-    sta result + 1
-    clc
-    lda arg2
-    adc result
-    sta result
-    lda #0
-    adc result + 1
-    sta result + 1
-
-    ; inner loop check
-    inc arg1
-    inc arg1
-    inc ptr
-    inc ptr
-    bne inner_loop
-
-    ; outer loop check
-    inc arg2
-    inc ptr + 1
-    lda ptr + 1
-    cmp #$80
-    bne outer_loop
-
-    rts
-
-.endproc
-
-.macro imul16_impl xe
-    .local arg1
-    .local arg2
-    .local result
-    .local inter
-    .local arg1_pos
-    .local arg2_pos
+.proc imul16_func
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
     inter = temp2
 
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
+    check_sign arg1 ; 5 to 25 cyc
+    check_sign arg2 ; 5 to 25 cyc
+
     ; h1l1 * h2l2
     ; (h1*256 + l1) * (h2*256 + l2)
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
-    imul8 result, arg1, arg2, xe
     lda #0
+    sta result + 0
+    sta result + 1
     sta result + 2
     sta result + 3
 
-    imul8 inter, arg1 + 1, arg2, xe
+    imul8 inter, arg1, arg2
+    add16 result, result, inter
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+
+    imul8 inter, arg1, arg2 + 1
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1, arg2 + 1, xe
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8 inter, arg1 + 1, arg2 + 1, xe
+    imul8 inter, arg1 + 1, arg2 + 1
     add16 result + 2, result + 2, inter
 
-    ; In case of negative inputs, adjust high word
-    ; https://stackoverflow.com/a/28827013
-    lda arg1 + 1
-    bpl arg1_pos
-    sub16 result + 2, result + 2, arg2
-arg1_pos:
-    lda arg2 + 1
-    bpl arg2_pos
-    sub16 result + 2, result + 2, arg1
-arg2_pos:
+    ; In case of mixed input signs, return a negative result.
+    cpy #1              ; 2 cyc
+    bne positive_result ; 2 cyc
+    neg32 result        ; 34 cyc
+positive_result:
 
     rts ; 6 cyc
-.endmacro
-
-.macro sqr16_impl xe
-    .scope
-        arg = FR0    ; 16-bit arg (clobbered)
-        result = FR2 ; 32-bit result
-        ;inter = temp2
-        inter = FR1
-
-        lda arg + 1
-        bpl arg_pos
-        neg16 arg
-    arg_pos:
-
-        ; hl * hl
-        ; (h*256 + l) * (h*256 + l)
-        ; h*256*(h*256 + l) + l*(h*256 + l)
-        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
-
-        sqr8 result, arg
-        lda #0
-        sta result + 2
-        sta result + 3
-
-        imul8 inter, arg + 1, arg, xe
-        add16 result + 1, result + 1, inter
-        add_carry result + 3
-        add16 result + 1, result + 1, inter
-        add_carry result + 3
-
-        sqr8_add16 result + 2, arg + 1
-
-        rts ; 6 cyc
-    .endscope
-.endmacro
-
-.proc imul16_func
-    imul16_impl 0
 .endproc
 
-.proc imul16xe_func
-    imul16_impl 1
-.endproc
-
-.proc sqr16_func
-    sqr16_impl 0
-.endproc
-
-.proc sqr16xe_func
-    sqr16_impl 1
-.endproc
-
-; 11-27 cycles
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -841,28 +574,21 @@ arg2_pos:
     ;                   round down if negative
     ;          < $8000: round down
 
-    ; $8000 17
-    ; $8001 27
-    ; $8100 21
-    ; $7fff 11
-
-    lda arg + 1    ; 3 cyc
-    cmp #$80       ; 2 cyc
-    beq high_half  ; 2 cyc
-
-    bpl increment  ; 2 cyc
-
-    bmi next       ; 2 cyc
+    lda arg + 1
+    cmp #$80
+    beq high_half
+    bpl increment
+    bmi next
 
 high_half:
-    lda arg        ; 3 cyc
-    beq check_sign ; 2 cyc
-
-    jmp increment  ; 3 cyc
+    lda arg
+    beq check_sign ; low 16 bits are exactly $8000: tie, decided by the sign
+    ; low byte is nonzero here, so the fraction is above $8000: always round up
+    bne increment  ; always taken (Z is clear once the beq falls through)
 
 check_sign:
-    lda arg + 3  ; 3 cyc
-    bmi next     ; 2 cyc
+    lda arg + 3
+    bmi next
 
 increment:       ; 5-10 cyc
     inc arg + 2  ; 5 cyc
@@ -875,8 +601,8 @@ next:
 
 .proc mandelbrot
     ; input:
-    ; cx: position scaled to 8.24 fixed point - -128..+127.9
-    ; cy: position scaled to 8.24
+    ; cx: position scaled to 4.12 fixed point - -8..+7.9
+    ; cy: position scaled to 4.12
     ;
     ; output:
     ; iter: iteration count at escape or 0
@@ -888,41 +614,12 @@ next:
     ; zx_zy = 0
     ; dist = 0
     ; iter = 0
-;    lda #00
-;    ldx #(iter - zx + 1)
-;initloop:
-;    sta zx - 1,x
-;    dex
-;    bne initloop
-;    sta z_buffer_start
-;    sta z_buffer_end
-
     lda #00
-    sta zx
-    sta zx + 1
-    sta zx + 2
-    sta zx + 3
-    sta zy
-    sta zy + 1
-    sta zy + 2
-    sta zy + 3
-    sta zx_2
-    sta zx_2 + 1
-    sta zx_2 + 2
-    sta zx_2 + 3
-    sta zy_2
-    sta zy_2 + 1
-    sta zy_2 + 2
-    sta zy_2 + 3
-    sta zx_zy
-    sta zx_zy + 1
-    sta zx_zy + 2
-    sta zx_zy + 3
-    sta dist
-    sta dist + 1
-    sta dist + 2
-    sta dist + 3
-    sta iter
+    ldx #(iter - zx + 1)
+initloop:
+    sta zx - 1,x
+    dex
+    bne initloop
     sta z_buffer_start
     sta z_buffer_end
 
@@ -934,8 +631,6 @@ loop:
 keep_going:
 
     .macro quick_exit arg, max
-        ; arg: fixed8.24
-        ; max: integer
         .local positive
         .local negative
         .local nope_out
@@ -943,61 +638,51 @@ keep_going:
         .local all_done
 
         ; check sign bit
-        lda arg + 3
+        lda arg + 1
         bmi negative
 
     positive:
-        cmp #max
+        cmp #((max) << 4)
         bmi all_done ; 'less than'
         jmp exit_path
 
     negative:
-        cmp #(256 - max)
+        cmp #(256 - ((max) << 4))
         beq first_equal ; 'equal' on first byte
         bpl all_done    ; 'greater than'
 
     nope_out:
         jmp exit_path
-
+    
     first_equal:
-        ; following bytes all 0 shows it's really 'equal'
-        lda arg + 2
-        bne all_done
-        lda arg + 1
-        bne all_done
         lda arg
-        bne all_done
-        jmp exit_path
+        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
 
     all_done:
     .endmacro
 
-    ; 8.24: (-128 .. 127.9)
+    ; 4.12: (-8 .. +7.9)
     ; zx = zx_2  - zy_2  + cx
-    sub32 zx, zx_2, zy_2
-    add32 zx, zx, cx
+    sub16 zx, zx_2, zy_2
+    add16 zx, zx, cx
     quick_exit zx, 2
 
     ; zy = zx_zy + zx_zy + cy
-    add32 zy, zx_zy, zx_zy
-    add32 zy, zy, cy
+    add16 zy, zx_zy, zx_zy
+    add16 zy, zy, cy
     quick_exit zy, 2
 
-    ; convert 8.24 -> 4.12: (-8 .. +7.9)
-    shift_round_16 zx, 4
-    shift_round_16 zy, 4
-
     ; zx_2 = zx * zx
-    sqr16 zx_2, zx + 2
+    imul16_round zx_2, zx, zx, 4
 
     ; zy_2 = zy * zy
-    sqr16 zy_2, zy + 2
+    imul16_round zy_2, zy, zy, 4
 
     ; zx_zy = zx * zy
-    imul16 zx_zy, zx + 2, zy + 2
+    imul16_round zx_zy, zx, zy, 4
 
     ; dist = zx_2 + zy_2
-    add32 dist, zx_2, zy_2
+    add16 dist, zx_2, zy_2
     quick_exit dist, 4
 
     ; if may be in the lake, look for looping output with a small buffer
@@ -1034,10 +719,10 @@ z_buffer_loop:
 
     ; Compare the previously stored z values
     ldy #0
-    z_compare zx + 2
-    z_compare zx + 3
-    z_compare zy + 2
-    z_compare zy + 3
+    z_compare zx
+    z_compare zx + 1
+    z_compare zy
+    z_compare zy + 1
 
     cpy #4
     bne z_no_matches
@@ -1052,10 +737,10 @@ z_no_matches:
 z_nothing_to_read:
 
     ; Store and expand
-    z_store zx + 2
-    z_store zx + 3
-    z_store zy + 2
-    z_store zy + 3
+    z_store zx
+    z_store zx + 1
+    z_store zy
+    z_store zy + 1
     z_advance
     stx z_buffer_end
 
@@ -1106,17 +791,14 @@ cont:
 enough:
 .endmacro
 
-.macro zoom_factor dest, src, aspect
-    ; output: dest: fixed8.24
-    ; input: src: fixed4.12
-    ; aspect: fixed4.12
+.macro zoom_factor dest, src, zoom, aspect
     ; clobbers A, X, flags, etc
     copy16 dest, src
     scale_zoom dest
 
     ; cy = cy * (3 / 4)
     ; cx = cx * (5 / 4)
-    imul16 dest, dest, aspect
+    imul16_round dest, dest, aspect, 4
 .endmacro
 
 .proc pset
@@ -1246,68 +928,32 @@ done:
 
 .proc vblank_handler
     inc count_frames
-
-    inc chroma_ticks
-    lda chroma_ticks
-    cmp #(chroma_delay)
-    bne skip_chroma
-
-    lda #0
-    sta chroma_ticks
-
-    inc chroma_offset
-    lda chroma_offset
-    cmp #(palette_chroma_entries)
-    bne skip_chroma
-
-    lda #0
-    sta chroma_offset
-skip_chroma:
-
-    inc palette_ticks
-    lda palette_ticks
-    cmp #(palette_delay)
-    bne skip_luma
-
-    lda #0
-    sta palette_ticks
-
     inc palette_offset
-    lda palette_offset
-    cmp #(palette_entries)
-    bne skip_luma
-
-    lda #0
-    sta palette_offset
-
-skip_luma:
     jsr update_palette
     jmp XITVBV
 .endproc
 
 .proc update_palette
-    lda #0
+    lda palette ; palette entry 0 is written to COLOR4 unmodified
     sta COLOR4
 
-    ldx chroma_offset
-    ldy palette_offset
-    lda palette_chroma,x
-    ora palette_start,y
-    sta COLOR2
-
-    ;inx
-    iny
-    lda palette_chroma,x
-    ora palette_start,y
-    sta COLOR1
-
-    ;inx
-    iny
-    lda palette_chroma,x
-    ora palette_start,y
+    clc
+    lda palette_offset
+    and #$f0 ; keep only the high nibble of palette_offset
+    adc palette + 1
     sta COLOR0
 
-    rts
+    clc ; the previous adc may have left carry set
+    lda palette_offset
+    and #$f0
+    adc palette + 2
+    sta COLOR1
+    clc
+    lda palette_offset
+    and #$f0
+    adc palette + 3
+    sta COLOR2
+    rts ; callers use jsr; without this, execution runs on into update_speed
 .endproc
 
 .proc update_speed
@@ -1341,15 +987,12 @@ skip_luma:
     cpy #KEY_MINUS
     beq minus
 
-    ; temp+temp2 = $00010000 << (8 - zoom)
-    lda #$00
+    ; temp = $0010 << (8 - zoom)
+    lda #$10
     sta temp
-    sta temp + 1
-    lda #$01
-    sta temp + 2
     lda #$00
-    sta temp + 3
-    scale_zoom temp + 2
+    sta temp + 1
+    scale_zoom temp
 
     cpy #KEY_UP
     beq up
@@ -1359,63 +1002,32 @@ skip_luma:
     beq left
     cpy #KEY_RIGHT
     beq right
-    jmp number_keys
- 
+
 skip_char:
     lda #0
     rts
 
 plus:
-    lda zoom
     cmp #8
     bpl skip_char
     inc zoom
     jmp done
 minus:
-    lda zoom
     cmp #1
     bmi skip_char
     dec zoom
     jmp done
 up:
-    sub32 oy, oy, temp
+    sub16 oy, oy, temp 
     jmp done
 down:
-    add32 oy, oy, temp
+    add16 oy, oy, temp
     jmp done
 left:
-    sub32 ox, ox, temp
+    sub16 ox, ox, temp
     jmp done
 right:
-    add32 ox, ox, temp
-    jmp done
-
-number_keys:
-    cpy #KEY_1
-    beq one
-    cpy #KEY_2
-    beq two
-    cpy #KEY_3
-    beq three
-    cpy #KEY_4
-    beq four
-    jmp skip_char
-
-one:
-    ldx #0
-    jmp load_key_viewport
-two:
-    ldx #1
-    jmp load_key_viewport
-three:
-    ldx #2
-    jmp load_key_viewport
-four:
-    ldx #3
-    ; fall through
-load_key_viewport:
-    jsr load_viewport
-    ; fall through
+    add16 ox, ox, temp
 done:
     lda #255
     rts
@@ -1453,51 +1065,27 @@ zero_byte_loop:
     rts
 .endproc
 
-; input: viewport selector in x
-; clobbers: a, x
-.proc load_viewport
-
-    lda viewport_zoom,x
-    sta zoom
-
-    txa
-    asl a
-    asl a
-
-    tax
-    lda viewport_ox,x
-    sta ox
-    lda viewport_oy,x
-    sta oy
-
-    inx
-    lda viewport_ox,x
-    sta ox + 1
-    lda viewport_oy,x
-    sta oy + 1
-
-    inx
-    lda viewport_ox,x
-    sta ox + 2
-    lda viewport_oy,x
-    sta oy + 2
-
-    inx
-    lda viewport_ox,x
-    sta ox + 3
-    lda viewport_oy,x
-    sta oy + 3
-
-    rts
-.endproc
-
 .proc start
 
-    jsr imul8xe_init
+    ; ox = 0; oy = 0
+    ; count_frames = 0; count_pixels = 0
+    lda #0
+    sta ox
+    sta ox + 1
+    sta oy
+    sta oy + 1
+    sta count_frames
+    sta count_pixels
 
-    ; initialize viewport
-    ldx #0 ; overview
-    jsr load_viewport
+    ; total_ms = 0.0; total_pixels = 0.0
+    ldx #total_ms
+    jsr ZF1
+    ldx #total_pixels
+    jsr ZF1
+
+    ; zoom = 2x
+    lda #1
+    sta zoom
 
     ; Disable display DMA
     lda #0
@@ -1530,9 +1118,6 @@ copy_byte_loop:
     ; Initialize the palette
     lda #0
     sta palette_offset
-    sta palette_delay
-    sta chroma_offset
-    sta chroma_delay
     jsr update_palette
 
     ; install the vblank handler
@@ -1542,17 +1127,6 @@ copy_byte_loop:
     jsr SETVBV
 
 main_loop:
-    ; count_frames = 0; count_pixels = 0
-    lda #0
-    sta count_frames
-    sta count_pixels
-
-    ; total_ms = 0.0; total_pixels = 0.0
-    ldx #total_ms
-    jsr ZF1
-    ldx #total_pixels
-    jsr ZF1
-
     jsr clear_screen
     jsr status_bar
 
@@ -1609,10 +1183,10 @@ skipped_mask:
 not_skipped_mask:
 
     ; run the fractal!
-    zoom_factor cx, sx, aspect_x
-    add32 cx, cx, ox
-    zoom_factor cy, sy, aspect_y
-    add32 cy, cy, oy
+    zoom_factor cx, sx, zoom, aspect_x
+    add16 cx, cx, ox
+    zoom_factor cy, sy, zoom, aspect_y
+    add16 cy, cy, oy
     jsr mandelbrot
     jsr pset
 
diff --git a/readme.md b/readme.md
index d60644c..6b57378 100644
--- a/readme.md
+++ b/readme.md
@@ -14,37 +14,32 @@ Non-goals:
 
 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
 
--- brooke, january 2023 - december 2024
+-- brooke, january 2023 - february 2024
 
 ## Current state
 
-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
+Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
 
-The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
 
-* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
-* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
-* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
-* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
+The multiplication's main loop is a basic add-and-shift using 16-bit adds, which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). It runs in 470-780 cycles depending on input.
 
-The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
 
 Iterations are capped at 255.
 
 The pixels are run in a progressive layout to get the basic shape on screen faster.
 
-There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
+## Next steps
 
-There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
 
-There's some cute color cycling.
+Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+
+I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
 
 ## Deps and build instructions
 
 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
 
 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
-
-## Todo
-
-See ideas in `todo.md`.
diff --git a/tables.js b/tables.js
index 50cbef9..5afc3c0 100644
--- a/tables.js
+++ b/tables.js
@@ -11,40 +11,23 @@ function db(func) {
     return lines.join('\n');
 }
 
-let squares = [];
-for (let i = 0; i < 512; i++) {
-    squares.push(Math.trunc((i * i + 1) / 2));
-}
-
 console.log(
 `.segment "TABLES"
 
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
-.export sqr_lobyte
-.export sqr_hibyte
 
-; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
+${db((x) => Math.round(x * x / 2) & 0xff)}
 
 .align 256
 mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
+${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
 
 .align 256
 mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
-
-; (i * i) for the plain squares
-.align 256
-sqr_lobyte:
-${db((i) => (i * i) & 0xff)}
-
-.align 256
-sqr_hibyte:
-${db((i) => ((i * i) >> 8) & 0xff)}
+${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
 
 `);
diff --git a/todo.md b/todo.md
deleted file mode 100644
index 284d653..0000000
--- a/todo.md
+++ /dev/null
@@ -1,19 +0,0 @@
-things to try:
-
-* skip add on the top-byte multiply in sqr8/mul8
-  * should save a few cycles, suggestion by jamey
-
-* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
-
-* try 3.13 fixed point instead of 4.12 for more precision
-  * can we get away without the extra bit?
-  * since exit compare space would be 6.26 i think so
-
-* y-axis mirror optimization
-
-* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
-  * maybe redo tiering to just 4x4, 2x2, 1x1?
-
-* extract viewport for display & re-input via keyboard
-
-* fujinet screenshot/viewport uploader