diff --git a/.gitignore b/.gitignore
index 8d2f7ce..771e47a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
diff --git a/Makefile b/Makefile
index 25148b4..c94074b 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,21 @@
 
 all : mandel.xex
 
-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+# Link the C front end, the assembly core and the generated tables into the
+# Atari executable. The linker config is a prerequisite so that editing it
+# forces a relink; only the object files are passed to ld65 as inputs.
+mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg
+	ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ $(filter %.o,$^) atari.lib
+
+# Compile the C source to ca65 assembly; the header is listed so that changing
+# it triggers a recompile.
+mandel.s : mandel.c mandel.h
+	cc65 -o $@ $<
 
 %.o : %.s
 	ca65 -o $@ $<
 
+# Generate the lookup tables. The shell creates the output file before node
+# runs, so on failure remove it: otherwise a truncated tables.s would be newer
+# than tables.js and be treated as up to date by the next build.
+tables.s : tables.js
+	node $< > $@ || { rm -f $@; exit 1; }
+
 clean :
+	rm -f tables.s
+	rm -f mandel.s
 	rm -f *.o
 	rm -f *.xex
-
+	rm -f mandel.map
diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
new file mode 100644
index 0000000..93b80f3
--- /dev/null
+++ b/atari-asm-xex.cfg
@@ -0,0 +1,28 @@
+FEATURES {
+    STARTADDRESS: default = $2E00;
+}
+SYMBOLS {
+    __STARTADDRESS__: type = export, value = %S;
+}
+MEMORY {
+    ZP:      file = "", define = yes, start = $0082, size = $007E;
+    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
+    # Keep $4000-7fff clear for expanded RAM access window
+    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
+    # Keep $a000-$bfff clear for BASIC cartridge
+}
+FILES {
+    %O: format = atari;
+}
+FORMATS {
+    atari: runad = start;
+}
+# Segment-to-memory mapping. Every attribute is comma-separated, matching the
+# other entries in this file.
+SEGMENTS {
+    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
+    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
+    CODE:     load = MAIN,    type = rw,                  define = yes;
+    RODATA:   load = MAIN,    type = ro,  optional = yes;
+    DATA:     load = MAIN,    type = rw,  optional = yes;
+    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
+    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
+}
diff --git a/atari-xex.cfg b/atari-xex.cfg
new file mode 100644
index 0000000..467d9d4
--- /dev/null
+++ b/atari-xex.cfg
@@ -0,0 +1,69 @@
+# Sample linker configuration for C programs using the Atari binary file support.
+# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex
+FEATURES {
+    STARTADDRESS: default = $8000;
+}
+SYMBOLS {
+    __SYSTEM_CHECK__:    type = import;  # force inclusion of "system check" load chunk
+    __STACKSIZE__:       type = weak, value = $0800; # 2k stack
+    __STARTADDRESS__:    type = export, value = %S;
+    __RESERVED_MEMORY__: type = weak, value = $0000;
+    __SYSCHKHDR__:       type = export, value = 0; # Disable system check header
+    __SYSCHKTRL__:       type = export, value = 0; # Disable system check trailer
+    __TABLESEG_START__:    type = weak, value = $2E00 + $0300;
+    __TABLESEG_SIZE__:     type = weak, value = 6 * $100;
+    __BANKSY_START__:  type = weak, value = $4000;
+    __BANKSY_SIZE__:   type = weak, value = $4000;
+    __FRAMEBUFFER_START__: type = weak, value = $A000;
+}
+MEMORY {
+# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP.
+    ZP:         file = "", define = yes, start = $0082, size = $007E;
+# "system check" load chunk
+    SYSCHKCHNK: file = %O,               start = $2E00, size = $0300;
+# Note: $a000-$bfff (the framebuffer region, see __FRAMEBUFFER_START__) conflicts with the BASIC cartridge; may require booting with OPTION.
+    TABLES:     file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__;
+# We reserve $4000-7fff for the bank-switch window.
+# In theory we could keep data and code here that we only use on 48k/64k systems.
+    BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__;
+# "main program" load chunk
+    MAIN:       file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S;
+}
+FILES {
+    %O: format = atari;
+}
+FORMATS {
+    atari: runad = start,
+           initad = SYSCHKCHNK: __SYSTEM_CHECK__;
+}
+# Segment-to-memory mapping for the C build: everything except the zero page,
+# the system-check chunk, the tables and the bank-switch window goes to MAIN.
+SEGMENTS {
+    ZEROPAGE:  load = ZP,         type = zp;
+    EXTZP:     load = ZP,         type = zp,                optional = yes;
+    SYSCHK:    load = SYSCHKCHNK, type = rw,  define = yes, optional = yes;
+    TABLES:    load = TABLES,     type = ro,  optional = yes, align = 256;
+    # NOTE(review): "BANKSWICH" looks like a typo for "BANKSWITCH" (the name of
+    # the memory area it loads into). Confirm which spelling the sources use in
+    # their .segment directives before renaming.
+    BANKSWICH: load = BANKSWITCH, type = ro,  optional = yes;
+    STARTUP:   load = MAIN,       type = ro,  define = yes;
+    LOWBSS:    load = MAIN,       type = rw,                optional = yes;  # not zero initialized
+    LOWCODE:   load = MAIN,       type = ro,  define = yes, optional = yes;
+    ONCE:      load = MAIN,       type = ro,                optional = yes;
+    CODE:      load = MAIN,       type = ro,  define = yes;
+    RODATA:    load = MAIN,       type = ro;
+    DATA:      load = MAIN,       type = rw;
+    INIT:      load = MAIN,       type = rw,                optional = yes;
+    BSS:       load = MAIN,       type = bss, define = yes;
+}
+FEATURES {
+    CONDES: type    = constructor,
+            label   = __CONSTRUCTOR_TABLE__,
+            count   = __CONSTRUCTOR_COUNT__,
+            segment = ONCE;
+    CONDES: type    = destructor,
+            label   = __DESTRUCTOR_TABLE__,
+            count   = __DESTRUCTOR_COUNT__,
+            segment = RODATA;
+    CONDES: type    = interruptor,
+            label   = __INTERRUPTOR_TABLE__,
+            count   = __INTERRUPTOR_COUNT__,
+            segment = RODATA,
+            import  = __CALLIRQ__;
+}
diff --git a/mandel-core.s b/mandel-core.s
new file mode 100644
index 0000000..6ebb089
--- /dev/null
+++ b/mandel-core.s
@@ -0,0 +1,2181 @@
+; Our zero-page vars
+ox              = $80 ; fixed6.26: center point x
+oy              = $84 ; fixed6.26: center point y
+cx              = $88 ; fixed6.26: c_x
+cy              = $8c ; fixed6.26: c_y
+
+zx              = $90 ; fixed6.26: z_x
+zy              = $94 ; fixed6.26: z_y
+zx_2            = $98 ; fixed6.26: z_x^2
+zy_2            = $9c ; fixed6.26: z_y^2
+
+zx_zy           = $a0 ; fixed6.26: z_x * z_y
+dist            = $a4 ; fixed6.26: z_x^2 + z_y^2
+sx              = $a8 ; i16: screen pixel x
+sy              = $aa ; i16: screen pixel y
+z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
+z_buffer_start  = $ad ; u8: index into z_buffer
+z_buffer_end    = $ae ; u8: index into z_buffer
+iter            = $af ; u8: iteration count
+
+ptr             = $b0 ; u16
+pixel_ptr       = $b2 ; u16
+zoom            = $b4 ; u8: zoom shift level
+fill_level      = $b5 ; u8
+pixel_color     = $b6 ; u8
+pixel_mask      = $b7 ; u8
+pixel_shift     = $b8 ; u8
+pixel_offset    = $b9 ; u8
+palette_offset  = $ba ; u8
+chroma_offset   = $bb ; u8
+palette_ticks   = $bc ; u8
+chroma_ticks    = $bd ; u8
+count_frames    = $be ; u8
+; free space $bf
+
+count_iters     = $c0 ; u16
+text_col        = $c2 ; u8
+text_row        = $c3 ; u8
+; free space c4-cb
+temp            = $cc ; u16
+temp2           = $ce ; u16
+
+palette_delay = 23
+chroma_delay = 137
+
+
+; FP registers in zero page
+FR0    = $d4 ; float48
+FRE    = $da
+FR1    = $e0 ; float48
+FR2    = $e6 ; float48
+CIX    = $f2 ; u8 - index into INBUFF
+INBUFF = $f3 ; u16 - pointer to ascii
+FLPTR  = $fc ; u16 - pointer to user buffer float48
+
+CH1    = $02f2 ; previous character read from keyboard
+CH     = $02fc ; current character read from keyboard
+
+LBUFF  = $0580 ; result buffer for FASC routine
+
+; FP ROM routine vectors
+FASC   = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
+IFP    = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
+FPI    = $D9D2 ; floating point to integer
+FADD   = $DA66 ; ADDITION       (FR0 += FR1)
+FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
+FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
+FDIV   = $DB28 ; DIVISION       (FR0 /= FR1)
+ZFR0   = $DA44 ; clear FR0
+ZF1    = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
+FLD0R  = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
+FLD1R  = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
+FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
+FMOVE  = $DDB6 ; MOVE FR0 TO FR1
+
+; High data
+framebuffer_top    = $a000
+textbuffer         = $af00
+framebuffer_bottom = $b000
+display_list       = $bf00
+framebuffer_end    = $c000
+
+height = 176
+half_height = height >> 1
+width = 160
+half_width = width >> 1
+stride = width >> 2
+
+EXTENDED_RAM = $4000 ; 16KiB bank on the XE
+PORTB  = $D301 ; memory & bank-switch for XL/XE
+
+DMACTL = $D400
+DLISTL = $D402
+DLISTH = $D403
+WSYNC  = $D40A
+
+; OS shadow registers
+SDLSTL = $230
+SDLSTH = $231
+
+; interrupt stuff
+SYSVBV = $E45F
+XITVBV = $E462
+SETVBV = $E45C
+
+COLOR0 = $2C4
+COLOR1 = $2C5
+COLOR2 = $2C6
+COLOR3 = $2C7
+COLOR4 = $2C8
+
+; Keycodes!
+KEY_PLUS  = $06
+KEY_MINUS = $0e
+KEY_UP    = $8e
+KEY_DOWN  = $8f
+KEY_LEFT  = $86
+KEY_RIGHT = $87
+KEY_1     = $1f
+KEY_2     = $1e
+KEY_3     = $1a
+KEY_4     = 24
+KEY_5     = 29
+KEY_6     = 27
+KEY_7     = 51
+KEY_8     = 53
+KEY_9     = 48
+KEY_0     = 50
+KEY_PERIOD = 34
+KEY_E     = 42
+KEY_X     = 22
+KEY_Y     = 43
+
+.struct float48
+    exponent .byte
+    mantissa .byte 5
+.endstruct
+
+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+.import sqr_lobyte
+.import sqr_hibyte
+
+.data
+
+strings:
+str_self:
+    .byte "MANDEL-6502"
+str_self_end:
+    .byte 0
+str_speed:
+    .byte "us/iter: "
+str_speed_end:
+    .byte 0
+str_run:
+    .byte " RUN"
+str_run_end:
+    .byte 0
+str_done:
+    .byte "DONE"
+str_done_end:
+    .byte 0
+str_padding:
+    .byte "      "
+str_padding_end:
+    .byte 0
+
+str_space:
+    .byte " "
+    .byte 0
+
+str_h:
+    .byte "h"
+    .byte 0
+str_m:
+    .byte "m"
+    .byte 0
+str_s:
+    .byte "s"
+    .byte 0
+
+str_speed_len = str_speed_end - str_speed
+str_run_len = str_run_end - str_run
+str_done_len = str_done_end - str_done
+str_padding_len = str_padding_end - str_padding
+
+; "3h59m59s"
+str_elapsed_spacer = 8
+speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1
+
+col_x = 1
+str_x:
+    .byte "X:"
+    .byte 0
+str_x_len = 2
+str_x_space = 12
+str_x_padding = 2
+
+col_y = col_x + str_x_len + str_x_space + str_x_padding
+str_y:
+    .byte "Y:"
+    .byte 0
+str_y_len = 2
+str_y_space = 12
+str_y_padding = 2
+
+col_zoom = col_y + str_y_len + str_y_space + str_y_padding
+str_zoom:
+    .byte "ZOOM:"
+    .byte 0
+str_zoom_len = 5
+
+char_map:
+    ; Map ATASCII string values to framebuffer font entries
+    ; Sighhhhh
+    .repeat 32, i
+        .byte i + 64
+    .endrepeat
+    .repeat 64, i
+        .byte i
+    .endrepeat
+    .repeat 32, i
+        .byte 96 + i
+    .endrepeat
+
+hex_chars:
+digits_zero:
+    .byte "0123456789abcdef"
+
+digits_space:
+    .byte " 123456789abcdef"
+
+aspect:
+    ; aspect ratio!
+    ; pixels at 320w are 5:6 (narrow)
+    ; pixels at 160w are 5:3 (wide)
+    ;
+    ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
+    ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
+    ;
+    ; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
+    ; &horizontal range -80 .. 79.9 is -3.125 .. 3.124
+    ;
+    ; 184h is the equiv of 220.8h at square pixels
+    ; 320 / 220.8 = 1.45 display aspect ratio
+aspect_x: ; fixed3.13 5/4
+    .word 5 << (13 - 2)
+
+aspect_y: ; fixed3.13 3/4
+    .word 3 << (13 - 2)
+
+fixed3_13_as_float: ; float48
+    ; 1 << 13
+    ; 8192
+    ; 81 92 . 00 00 00
+    .byte 65 ; exponent/sign - +1 byte
+    .byte $81
+    .byte $92
+    .byte $00
+    .byte $00
+    .byte $00
+
+sec_per_frame: ; float48 00 . 01 66 66 66 67
+    .byte 63  ; exponent/sign - -1 bytes
+    .byte $01 ; BCD digits
+    .byte $66
+    .byte $66
+    .byte $66
+    .byte $67
+
+us_per_sec: ; float48 1e6 (1,000,000): 01 00 00 00 . 00
+    .byte 67  ; exponent/sign +3 bytes
+    .byte $01 ; BCD digits
+    .byte $00
+    .byte $00
+    .byte $00
+    .byte $00
+
+total_iters: ; float48
+    .repeat 6
+        .byte 0
+    .endrepeat
+
+total_sec: ; float48
+    .repeat 6
+        .byte 0
+    .endrepeat
+
+display_list_start:
+    ; 24 lines overscan
+    .repeat 3
+        .byte $70 ; 8 blank lines
+    .endrep
+
+    ; 8 scan lines, 1 row of 40-column text
+    .byte $42
+    .addr textbuffer
+
+    ; 176 lines graphics (2 * half_height)
+    ; ANTIC mode e (160px 2bpp, 1 scan line per line)
+    .byte $4e
+    .addr framebuffer_top
+    .repeat half_height - 1
+        .byte $0e
+    .endrep
+    .byte $4e
+    .addr framebuffer_bottom
+    .repeat half_height - 1
+        .byte $0e
+    .endrep
+
+    ; 8 scan lines, 1 row of 40-column text
+    .byte $42
+    .addr textbuffer + 40
+
+    .byte $41 ; jump and blank
+    .addr display_list
+display_list_end:
+display_list_len = display_list_end - display_list_start
+
+color_map:
+    .byte 0
+    .repeat 85
+        .byte %01010101
+        .byte %10101010
+        .byte %11111111
+    .endrepeat
+
+
+palette_start:
+    .byte $0e
+    .byte $08
+    .byte $04
+palette_repeat:
+    .byte $0e
+    .byte $08
+
+palette_entries = 3
+
+palette_chroma:
+    .repeat 15, i
+        .byte (i + 1) << 4
+    .endrepeat
+    .repeat 2, i
+        .byte (i + 1) << 4
+    .endrepeat
+palette_chroma_entries = 15
+
+.code
+
+;z_buffer_len = 16 ; 10.863 ms/px
+;z_buffer_len = 12 ; 10.619 ms/px
+z_buffer_len = 8 ; 10.612 ms/px
+;z_buffer_len = 4 ; 12.395 ms/px
+z_buffer_mask = z_buffer_len - 1
+z_buffer:
+    ; the last N zx/zy values
+    .repeat z_buffer_len
+        .word 0
+        .word 0
+    .endrepeat
+
+.export _mandel_start
+
+;max_fill_level = 6
+max_fill_level = 3
+fill_masks:
+;    .byte %00011111
+;    .byte %00001111
+;    .byte %00000111
+    .byte %00000011
+    .byte %00000001
+    .byte %00000000
+
+pixel_masks:
+    .byte %11111111
+    .byte %11110000
+    .byte %11000000
+
+viewport_zoom:
+    .byte 0
+    .byte 5
+    .byte 7
+    .byte 5
+    .byte 7
+    .byte 7
+
+viewport_ox:
+    .dword ($00000000 & $3fffffff) << 2
+    .dword ($ff110000 & $3fffffff) << 2
+    .dword ($ff110000 & $3fffffff) << 2
+    .dword ($fe400000 & $3fffffff) << 2
+    .dword ($fe3b0000 & $3fffffff) << 2
+    .dword $fd220000
+
+viewport_oy:
+    .dword ($00000000 & $3fffffff) << 2
+    .dword ($ffb60000 & $3fffffff) << 2
+    .dword ($ffbe0000 & $3fffffff) << 2
+    .dword ($00000000 & $3fffffff) << 2
+    .dword ($fffe0000 & $3fffffff) << 2
+    .dword $ff000000
+
+elapsed_work:
+    .dword 0
+elapsed_digit:
+    .byte 0
+
+input_col:
+    .byte 0
+input_row:
+    .byte 0
+input_max:
+    .byte 0
+
+; Multi-byte addition, least-significant byte first: dest = arg1 + arg2.
+; Costs 2 + 9 * bytes cycles with zero-page operands; clobbers A and flags.
+.macro add bytes, dest, arg1, arg2
+    clc                     ; no carry into the lowest byte
+    .repeat bytes, idx
+        lda arg1 + idx
+        adc arg2 + idx      ; carry ripples up to the next byte
+        sta dest + idx
+    .endrepeat
+.endmacro
+
+; 16-bit add (20 cycles)
+.macro add16 dest, arg1, arg2
+    add 2, dest, arg1, arg2
+.endmacro
+
+; 32-bit add (38 cycles)
+.macro add32 dest, arg1, arg2
+    add 4, dest, arg1, arg2
+.endmacro
+
+; Fold the carry left by a previous add into one more byte: dest += C.
+; 8 cycles
+.macro add_carry dest
+    lda dest
+    adc #0
+    sta dest
+.endmacro
+
+; Multi-byte subtraction, least-significant byte first: dest = arg1 - arg2.
+; Costs 2 + 9 * bytes cycles with zero-page operands; clobbers A and flags.
+.macro sub bytes, dest, arg1, arg2
+    sec                     ; no borrow into the lowest byte
+    .repeat bytes, idx
+        lda arg1 + idx
+        sbc arg2 + idx      ; borrow ripples up to the next byte
+        sta dest + idx
+    .endrepeat
+.endmacro
+
+; 16-bit subtract (20 cycles)
+.macro sub16 dest, arg1, arg2
+    sub 2, dest, arg1, arg2
+.endmacro
+
+; 32-bit subtract (38 cycles)
+.macro sub32 dest, arg1, arg2
+    sub 4, dest, arg1, arg2
+.endmacro
+
+; Multi-byte shift left by one bit, in place, least-significant byte first.
+; 5 * bytes cycles with a zero-page operand (asl and rol on zero page are
+; both 5-cycle read-modify-write instructions)
+.macro shl bytes, arg
+    asl arg              ; 5 cyc (zero page); top bit goes to carry
+    .repeat bytes-1, i
+        rol arg + 1 + i  ; 5 cyc (zero page); carry shifts into bit 0
+    .endrepeat
+.endmacro
+
+; 10 cycles (zero page)
+.macro shl16 arg
+    shl 2, arg
+.endmacro
+
+; 15 cycles (zero page)
+.macro shl24 arg
+    shl 3, arg
+.endmacro
+
+; 20 cycles (zero page)
+.macro shl32 arg
+    shl 4, arg
+.endmacro
+
+; Multi-byte copy: dest = arg, one byte at a time through A.
+; With zero-page operands: 6 * bytes cycles, 4 * bytes bytes of code.
+.macro copy bytes, dest, arg
+    .repeat bytes, idx
+        lda arg + idx
+        sta dest + idx
+    .endrepeat
+.endmacro
+
+; 16-bit copy (12 cycles, 8 bytes)
+.macro copy16 dest, arg
+    copy 2, dest, arg
+.endmacro
+
+; 32-bit copy (24 cycles)
+.macro copy32 dest, arg
+    copy 4, dest, arg
+.endmacro
+
+; 6-byte float48 copy (36 cycles)
+.macro copyfloat dest, arg
+    copy 6, dest, arg
+.endmacro
+
+; Multi-byte two's-complement negation in place: arg = 0 - arg.
+; Costs 2 + 8 * bytes cycles with a zero-page operand; clobbers A and flags.
+.macro neg bytes, arg
+    sec                     ; no borrow into the lowest byte
+    .repeat bytes, idx
+        lda #00
+        sbc arg + idx
+        sta arg + idx
+    .endrepeat
+.endmacro
+
+; 16-bit negate (18 cycles)
+.macro neg16 arg
+    neg 2, arg
+.endmacro
+
+; 32-bit negate (34 cycles)
+.macro neg32 arg
+    neg 4, arg
+.endmacro
+
+; Shift a 32-bit value left by `shift` bits, then round its top 16 bits
+; in place using the low 16 bits (see round16).
+; 11-27 + 20 * shift cycles with a zero-page operand: each shl32 is four
+; 5-cycle read-modify-write shifts
+; 71-87 cycles for shift=3
+.macro shift_round_16 arg, shift
+    .repeat shift
+        shl32 arg ; 20 cycles (zero page)
+    .endrepeat
+    round16 arg ; 11-27 cycles
+.endmacro
+
+; 16x16 -> 32 bit multiply through the shared imul16_func routine.
+; Operands are passed in FR0 and FR1; the 32-bit product is read from FR2.
+; input: arg1, arg2 as fixed4.12
+; output: dest as fixed8.24
+; patch point jsr at 16 bytes in
+; (16 bytes assumes zero-page arg1/arg2, so each lda/sta pair is 4 bytes;
+; presumably the jsr target is rewritten at run time -- the patching code is
+; not in this part of the file)
+imul16_patch_offset = 16
+.macro imul16 dest, arg1, arg2
+    copy16 FR0, arg1  ; 12 cyc
+    copy16 FR1, arg2  ; 12 cyc
+    jsr imul16_func   ; ? cyc
+    copy32 dest, FR2  ; 24 cyc
+.endmacro
+
+; 16-bit square through the shared sqr16_func routine.
+; The operand is passed in FR0; the 32-bit result is read from FR2.
+; input: arg as fixed4.12
+; output: dest as fixed8.24
+; patch point jsr at 8 bytes in
+; (8 bytes assumes a zero-page arg)
+sqr16_patch_offset = 8
+.macro sqr16 dest, arg
+    copy16 FR0, arg   ; 12 cyc
+    jsr sqr16_func    ; ? cyc
+    copy32 dest, FR2  ; 24 cyc
+.endmacro
+
+; 8-bit square by table lookup in sqr_lobyte / sqr_hibyte (imported).
+; input: arg as u8
+; output: dest as u16
+; clobbers a, x
+.macro sqr8 dest, arg
+    ldx arg
+    lda sqr_lobyte,x
+    sta dest
+    lda sqr_hibyte,x
+    sta dest + 1
+.endmacro
+
+.segment "TABLES"
+; lookup table for top byte -> PORTB value for bank-switch
+; Entry i is $e3 with the top two bits of i (bits 7-6) moved down into
+; bits 3-2, so the table is indexed directly by an 8-bit multiplier.
+; Page-aligned: 256 one-byte entries.
+.align 256
+bank_switch_table:
+    .repeat 256, i
+        .byte ((i & $c0) >> 4) | $e3
+    .endrepeat
+
+.code
+
+; Select extended-RAM bank `bank` (an assemble-time constant) by writing
+; the same bit pattern the table uses to PORTB. Clobbers A.
+.macro bank_switch bank
+    lda #((bank << 2) | $e3)
+    sta PORTB
+.endmacro
+
+; Unsigned 8x8 -> 16 bit multiply: dest = arg1 * arg2.
+; xe is an assemble-time flag selecting one of two implementations:
+;   xe != 0: look the product up in the bank-switched table at $4000-$7fff
+;            (filled by imul8xe_init_section); leaves the bank switched in
+;   xe == 0: half-square tables mul_lobyte256 / mul_hibyte256 / mul_hibyte512
+; Both variants clobber A and X; the xe variant also clobbers Y and
+; writes ptr + 1 (it relies on the low byte of ptr already being 0).
+.macro imul8 dest, arg1, arg2, xe
+    .if xe
+        ; using 64KB lookup table
+        ; 51-70 cycles
+        ; clobbers x, y, dest, ptr
+        .scope
+            output = dest
+
+            ; top 2 bits are the table bank selector
+            ldx arg2                ; 3 cyc
+            lda bank_switch_table,x ; 4 cyc
+            sta PORTB               ; 4 cyc
+
+            ; bottom 14 bits except the LSB are the per-bank table index
+            ; add $4000 for the bank pointer
+            ; (low 6 bits of arg2 pick the page, arg1 & $fe the offset)
+            txa          ; 2 cyc
+            and #$3f     ; 2 cyc
+            ora #$40     ; 2 cyc
+            sta ptr + 1  ; 3 cyc
+
+            ; copy the entry into output
+            lda arg1     ; 3 cyc
+            and #$fe     ; 2 cyc
+            tay          ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output   ; 3 cyc
+            iny          ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output+1 ; 3 cyc
+
+            ; note: we are not restoring memory to save 6 cycles!
+            ; this means those 16kb have to be switched back to base RAM
+            ; if we need to use them anywhere else
+            ;;; restore memory
+            ;;lda #$81     ; 2 cyc - disabled
+            ;;sta PORTB    ; 4 cyc - disabled
+
+            ; check that 1 bit we skipped to fit into space
+            lda arg1     ; 3 cyc
+            and #1       ; 2 cyc
+            beq done     ; 2 cyc
+
+            ; add arg2 one last time for the skipped bit
+            ; (X still holds arg2 from the bank lookup above)
+            clc          ; 2 cyc
+            txa          ; 2 cyc
+            adc output   ; 3 cyc
+            sta output   ; 3 cyc
+            lda #0       ; 2 cyc
+            adc output+1 ; 3 cyc
+            sta output+1 ; 3 cyc
+
+        done:
+        .endscope
+    .else
+        ; Using base 48k RAM compatibility mode
+        ; Small table of half squares
+        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
+        ; 81-92 cycles
+        .scope
+            mul_factor_a   = arg1
+            mul_factor_x   = arg2
+            mul_product_lo = dest
+            mul_product_hi = dest + 1
+
+            lda mul_factor_a      ; 3 cyc
+
+            ; (a + x)^2/2
+            clc                   ; 2 cyc
+            adc mul_factor_x      ; 3 cyc
+            tax                   ; 2 cyc
+            bcc under256          ; 2 cyc
+            ; sum >= 256: high byte comes from the second table; the carry
+            ; is already set for the subtraction below
+            lda mul_hibyte512,x   ; 4 cyc
+            bcs next              ; 2 cyc
+        under256:
+            lda mul_hibyte256,x   ; 4 cyc
+            sec                   ; 2 cyc
+        next:
+            sta mul_product_hi    ; 3 cyc
+            lda mul_lobyte256,x   ; 4 cyc
+
+            ; - a^2/2
+            ldx mul_factor_a      ; 3 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+
+            ; + x & a & 1:
+            ; (this is a kludge to correct a
+            ; roundoff error that makes odd * odd too low)
+            ldx mul_factor_x      ; 3 cyc
+            txa                   ; 2 cyc
+            and mul_factor_a      ; 3 cyc
+            and #1                ; 2 cyc
+
+            clc                   ; 2 cyc
+            adc mul_product_lo    ; 3 cyc
+            bcc small_product     ; 2 cyc
+            inc mul_product_hi    ; 5 cyc
+
+            ; - x^2/2
+            ; (A still holds the corrected low byte from the add above)
+        small_product:
+            sec                   ; 2 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+        .endscope
+    .endif
+.endmacro
+
+
+; Initialize a 16 KB chunk of the multiplication table at $4000-$7fff.
+; Each 256-byte page belongs to one multiplier value; the 16-bit entry at
+; even offset j within the page holds j * multiplier. This is the layout the
+; xe variant of imul8 reads back.
+; input: FR2 = multiplier for the first page
+; output: FR2 advanced by one per page (64 pages)
+; clobbers: a, y, FR0 (running product), FR1 (bumped by 2 per entry), temp2
+; The bank to fill is presumably selected by the caller; no PORTB write
+; happens here.
+.proc imul8xe_init_section
+    counter = FR1
+    factor = FR2
+    entry = FR0
+    dst = temp2
+
+    ; dst = $4000, y = 0
+    ldy #0
+    sty dst
+    lda #$40
+    sta dst + 1
+
+    ; one pass per page: $40 -> $7f
+next_page:
+
+    ; every page starts with 0 * factor
+    lda #0
+    sta entry
+    sta entry + 1
+
+    ; one pass per 16-bit entry: offsets $00, $02 .. $fe
+next_entry:
+
+    ; store the current product, low byte first
+    lda entry
+    sta (dst),y
+    iny
+    lda entry + 1
+    sta (dst),y
+    dey
+
+    ; entry += 2 * factor, as two 16-bit additions
+    .repeat 2
+        clc
+        lda entry
+        adc factor
+        sta entry
+        add_carry entry + 1
+    .endrepeat
+
+    ; advance to the next entry; the low pointer byte wraps to 0 at page end
+    inc counter
+    inc counter
+    inc dst
+    inc dst
+    bne next_entry
+
+    ; advance to the next page and multiplier, stopping at $8000
+    inc factor
+    inc dst + 1
+    lda dst + 1
+    cmp #$80
+    bne next_page
+
+    rts
+
+.endproc
+
+; Body of the signed 16x16 -> 32 bit multiply routines, ending in rts.
+; Builds the product from four unsigned 8x8 partial products (imul8), then
+; corrects the high word when either operand is negative.
+; input: FR0, FR1 (16-bit operands); output: FR2 (32-bit product)
+; xe: passed straight through to imul8 to pick its implementation
+; uses temp2 as scratch for the two cross products
+.macro imul16_impl xe
+    .local arg1
+    .local arg2
+    .local result
+    .local inter
+    .local arg1_pos
+    .local arg2_pos
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2
+
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+
+    ; l1*l2 -> bytes 0-1
+    imul8 result, arg1, arg2, xe
+
+    ; h1*h2 -> bytes 2-3
+    imul8 result + 2, arg1 + 1, arg2 + 1, xe
+
+    ; h1*l2 added at byte 1, carry propagated into byte 3
+    imul8 inter, arg1 + 1, arg2, xe
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    ; l1*h2 added at byte 1, carry propagated into byte 3
+    imul8 inter, arg1, arg2 + 1, xe
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1
+arg2_pos:
+
+    rts ; 6 cyc
+.endmacro
+
+; Body of the signed 16-bit square routines, ending in rts.
+; Takes the absolute value of the operand, then combines two table-lookup
+; squares (sqr8) with the doubled cross product from imul8.
+; input: FR0 (16-bit operand, negated in place if negative)
+; output: FR2 (32-bit square); FR1 is used as scratch
+; xe: passed straight through to imul8 to pick its implementation
+.macro sqr16_impl xe
+    .scope
+        arg = FR0    ; 16-bit arg (clobbered)
+        result = FR2 ; 32-bit result
+        ;inter = temp2
+        inter = FR1
+
+        ; square the magnitude: the sign does not affect the result
+        lda arg + 1
+        bpl arg_pos
+        neg16 arg
+    arg_pos:
+
+        ; hl * hl
+        ; (h*256 + l) * (h*256 + l)
+        ; h*256*(h*256 + l) + l*(h*256 + l)
+        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
+
+        ; l*l -> bytes 0-1
+        sqr8 result, arg
+
+        ; h*h -> bytes 2-3
+        sqr8 result + 2, arg + 1
+
+        ; h*l is added twice at byte 1, carrying into byte 3 each time
+        imul8 inter, arg + 1, arg, xe
+        add16 result + 1, result + 1, inter
+        add_carry result + 3
+        add16 result + 1, result + 1, inter
+        add_carry result + 3
+
+        rts ; 6 cyc
+    .endscope
+.endmacro
+
+; Signed 16x16 multiply, FR0 * FR1 -> FR2, using the half-square tables.
+.proc imul16_func
+    imul16_impl 0
+.endproc
+
+; Signed 16x16 multiply, FR0 * FR1 -> FR2, using the bank-switched table.
+.proc imul16xe_func
+    imul16_impl 1
+.endproc
+
+; Signed 16-bit square, FR0^2 -> FR2, using the half-square tables.
+.proc sqr16_func
+    sqr16_impl 0
+.endproc
+
+; Signed 16-bit square, FR0^2 -> FR2, using the bank-switched table.
+.proc sqr16xe_func
+    sqr16_impl 1
+.endproc
+
+; 11-27 cycles
+; arg is a 4-byte little-endian value: bytes 0-1 are the fraction being
+; discarded, bytes 2-3 the 16-bit result that may be incremented.
+; Clobbers A and flags.
+.macro round16 arg
+    ; Round top 16 bits of 32-bit fixed-point number in-place
+    .local increment
+    .local high_half
+    .local check_sign
+    .local next
+
+    ; low word > $8000: round up
+    ;          = $8000: round up   if positive
+    ;                   round down if negative
+    ;          < $8000: round down
+
+    ; $8000 17
+    ; $8001 27
+    ; $8100 21
+    ; $7fff 11
+
+    ; compare the high byte of the low word against $80; after the cmp the
+    ; N flag is clear for $81-$ff (round up) and set for $00-$7f (round down)
+    lda arg + 1    ; 3 cyc
+    cmp #$80       ; 2 cyc
+    beq high_half  ; 2 cyc
+
+    bpl increment  ; 2 cyc
+
+    bmi next       ; 2 cyc
+
+high_half:
+    ; high byte is exactly $80: any non-zero low byte puts us above half
+    lda arg        ; 3 cyc
+    beq check_sign ; 2 cyc
+
+    jmp increment  ; 3 cyc
+
+check_sign:
+    ; exactly half: only round up when the value is non-negative
+    lda arg + 3  ; 3 cyc
+    bmi next     ; 2 cyc
+
+increment:       ; 5-10 cyc
+    inc arg + 2  ; 5 cyc
+    bne next     ; 2 cyc
+    inc arg + 3  ; 5 cyc
+
+next:
+
+.endmacro
+
+; Convert a signed 3.13 fixed-point value to an Atari float.
+; The magnitude is converted as an unsigned integer, the sign is applied to
+; the float, and the result is divided by 8192 (1 << 13).
+; input in FR0, 16 bits signed 3.13 fixed
+; output in FR0, Atari float
+; clobbers a, x, y, FR0, FR1
+.proc fixed3_13_to_float
+    ; FR1 = 8192.0, the divisor used at the end
+    ldy #.hibyte(fixed3_13_as_float)
+    ldx #.lobyte(fixed3_13_as_float)
+    jsr FLD1R
+
+    ; IFP only handles unsigned input, so branch on the sign first
+    lda FR0 + 1
+    bmi is_negative
+
+    jsr IFP
+    jmp scale
+
+is_negative:
+    ; convert the magnitude, then set the sign bit in the float's first byte
+    neg16 FR0
+    jsr IFP
+    lda #$80
+    ora FR0
+    sta FR0
+
+scale:
+    jsr FDIV
+    rts
+
+.endproc
+
+; Convert a signed 6.26 fixed-point value to an Atari float.
+; The value is first shifted left 3 bits and rounded to its top 16 bits
+; (3.13), so precision beyond 13 fractional bits is lost.
+; input in FR0, 32 bits signed 6.26 fixed
+; output in FR0, Atari float
+; clobbers a, x, y, FR0, FR1
+.proc fixed6_26_to_float
+    shift_round_16 FR0, 3
+
+    ; move the rounded high word down to where the 16-bit converter reads it
+    lda FR0 + 2
+    sta FR0
+    lda FR0 + 3
+    sta FR0 + 1
+
+    jsr fixed3_13_to_float
+    rts
+.endproc
+
+; Convert an Atari float to signed 3.13 fixed point.
+; The float is multiplied by 8192 (1 << 13), its magnitude is converted with
+; the unsigned FPI routine, and the sign is re-applied to the integer.
+; input in FR0, Atari float
+; output in FR0, 16 bits signed 3.13 fixed
+; clobbers a, x, y, FR0, FR1
+; Error returns (carry) from FMUL and FPI are not checked.
+.proc float_to_fixed3_13
+    ldx #.lobyte(fixed3_13_as_float)
+    ldy #.hibyte(fixed3_13_as_float)
+    jsr FLD1R
+    jsr FMUL
+
+    ; check sign bit! conversion routine is for unsigned
+    ; The sign is bit 7 of the first float byte, so test the N flag set by
+    ; the load. (The carry flag is not affected by lda; it only reflects
+    ; FMUL's status, so branching on it never detected negative values.)
+    lda FR0
+    bpl positive
+
+negative:
+    ; clear float sign bit (A still holds FR0, and bit 7 is known to be set)
+    eor #$80
+    sta FR0
+
+    jsr FPI
+    neg16 FR0
+    jmp common
+
+positive:
+    jsr FPI
+
+common:
+    rts
+
+.endproc
+
+; Iterate z = z^2 + c for one point until it escapes, the 8-bit iteration
+; counter wraps, or (when lake detection is active) z repeats a recent value.
+; Also defines the helper macros quick_exit, z_compare, z_advance, z_store.
+; Clobbers a, x, y, and FR0-FR2 / temp2 through the multiply routines.
+.proc mandelbrot
+    ; input:
+    ; cx: position scaled to 6.26 fixed point - -32..+31.9
+    ; cy: position scaled to 6.26
+    ;
+    ; output:
+    ; iter: iteration count at escape or 0
+    ; z_buffer_active: 1 if iter ended at 0 (point treated as in the lake),
+    ;                  0 otherwise; it controls loop detection on the next call
+
+    ; zx = 0
+    ; zy = 0
+    ; zx_2 = 0
+    ; zy_2 = 0
+    ; zx_zy = 0
+    ; dist = 0
+    ; iter = 0
+;    lda #00
+;    ldx #(iter - zx + 1)
+;initloop:
+;    sta zx - 1,x
+;    dex
+;    bne initloop
+;    sta z_buffer_start
+;    sta z_buffer_end
+
+    lda #00
+    sta zx
+    sta zx + 1
+    sta zx + 2
+    sta zx + 3
+    sta zy
+    sta zy + 1
+    sta zy + 2
+    sta zy + 3
+    sta zx_2
+    sta zx_2 + 1
+    sta zx_2 + 2
+    sta zx_2 + 3
+    sta zy_2
+    sta zy_2 + 1
+    sta zy_2 + 2
+    sta zy_2 + 3
+    sta zx_zy
+    sta zx_zy + 1
+    sta zx_zy + 2
+    sta zx_zy + 3
+    sta dist
+    sta dist + 1
+    sta dist + 2
+    sta dist + 3
+    sta iter
+    sta z_buffer_start
+    sta z_buffer_end
+
+loop:
+    ; 16-bit running total of iterations performed
+    inc count_iters
+    bne low_iters
+    inc count_iters + 1
+low_iters:
+
+    ; iter++ & max-iters break
+    ; (iter wrapping to 0 after 255 iterations is the max-iters exit)
+    inc iter
+    bne keep_going
+    jmp exit_path
+keep_going:
+
+    ; Jump to exit_path unless -max < arg < max, looking only at the top
+    ; byte (integer part << 2) except for the exact -max case.
+    .macro quick_exit arg, max
+        ; arg: fixed6.26
+        ; max: integer
+        .local positive
+        .local negative
+        .local nope_out
+        .local first_equal
+        .local all_done
+
+        ; check sign bit
+        lda arg + 3
+        bmi negative
+
+    positive:
+        cmp #(max << 2)
+        bmi all_done ; 'less than'
+        jmp exit_path
+
+    negative:
+        cmp #(256 - (max << 2))
+        beq first_equal ; 'equal' on first byte
+        bpl all_done    ; 'greater than'
+
+    nope_out:
+        jmp exit_path
+
+    first_equal:
+        ; following bytes all 0 shows it's really 'equal'
+        lda arg + 2
+        bne all_done
+        lda arg + 1
+        bne all_done
+        lda arg
+        bne all_done
+        jmp exit_path
+
+    all_done:
+    .endmacro
+
+    ; 6.26: (-32 .. 31.9)
+    ; zx = zx_2  - zy_2  + cx
+    sub32 zx, zx_2, zy_2
+    add32 zx, zx, cx
+    quick_exit zx, 2
+
+    ; zy = zx_zy + zx_zy + cy
+    add32 zy, zx_zy, zx_zy
+    add32 zy, zy, cy
+    quick_exit zy, 2
+
+    ; convert 6.26 -> 3.13: (-4 .. +3.9)
+    shift_round_16 zx, 3
+    shift_round_16 zy, 3
+
+    ; zx_2 = zx * zx
+fixup_sqr16_1:
+    sqr16 zx_2, zx + 2
+
+    ; zy_2 = zy * zy
+fixup_sqr16_2:
+    sqr16 zy_2, zy + 2
+
+    ; zx_zy = zx * zy
+fixup_imul16_1:
+    imul16 zx_zy, zx + 2, zy + 2
+
+    ; dist = zx_2 + zy_2
+    add32 dist, zx_2, zy_2
+    quick_exit dist, 4
+
+    ; if may be in the lake, look for looping output with a small buffer
+    ; as an optimization vs running to max iters
+    lda z_buffer_active
+    beq skip_z_buffer
+
+    ldx z_buffer_start
+    cpx z_buffer_end
+    beq z_nothing_to_read
+
+z_buffer_loop:
+    ; Compare one stored byte with arg, advancing X; Y counts the matches.
+    .macro z_compare arg
+        .local compare_no_match
+        lda z_buffer,x
+        inx
+        cmp arg
+        bne compare_no_match
+        iny
+    compare_no_match:
+    .endmacro
+    ; Wrap the byte index X back to 0 at the end of the ring buffer.
+    ; The comparison is unsigned (bcc) so it stays correct for any
+    ; buffer size that fits an 8-bit index.
+    .macro z_advance
+        .local skip_reset_x
+        cpx #(z_buffer_len * 4)
+        bcc skip_reset_x
+        ldx #0
+    skip_reset_x:
+    .endmacro
+    ; Store one byte of the current z value at X, advancing X.
+    .macro z_store arg
+        lda arg
+        sta z_buffer,x
+        inx
+    .endmacro
+
+    ; Compare the previously stored z values
+    ldy #0
+    z_compare zx + 2
+    z_compare zx + 3
+    z_compare zy + 2
+    z_compare zy + 3
+
+    ; all four bytes equal: z has returned to an earlier value
+    cpy #4
+    bne z_no_matches
+    jmp z_exit
+
+z_no_matches:
+    z_advance
+
+    cpx z_buffer_end
+    bne z_buffer_loop
+
+z_nothing_to_read:
+
+    ; Store and expand
+    z_store zx + 2
+    z_store zx + 3
+    z_store zy + 2
+    z_store zy + 3
+    z_advance
+    stx z_buffer_end
+
+    ; Increment the start roller if necessary (limit size)
+    ; iter is an unsigned count up to 255, so test it with the carry flag.
+    ; (A signed bmi here treated iter >= 160 as "below the threshold" and
+    ; stopped advancing the start index for late iterations.)
+    lda iter
+    cmp #(z_buffer_len * 4)
+    bcc skip_inc_start
+    lda z_buffer_start
+    clc
+    adc #4
+    tax
+    z_advance
+    stx z_buffer_start
+skip_inc_start:
+
+skip_z_buffer:
+
+    jmp loop
+
+z_exit:
+    ; a repeating orbit never escapes: report it like hitting max iterations
+    lda #0
+    sta iter
+
+exit_path:
+    ; z_buffer_active = (iter == 0) ? 1 : 0
+    ldx #0
+    lda iter
+    bne next
+    inx
+next:
+    stx z_buffer_active
+    rts
+
+.endproc
+
+; Shift the 16-bit value at dest left once for every step that zoom is
+; below 8: dest <<= (8 - zoom).
+.macro scale_zoom dest
+    ; clobbers X, flags
+    .local shift_again
+    .local shifted
+
+    ; X counts up from the current zoom level until it reaches 8
+    ldx zoom
+    cpx #8
+    beq shifted
+shift_again:
+    shl16 dest
+    inx
+    cpx #8
+    bne shift_again
+shifted:
+.endmacro
+
+; Turn a 16-bit screen-space value into a 32-bit coordinate offset:
+; copy src to dest, shift it left by (8 - zoom) bits with scale_zoom, then
+; multiply by the aspect factor with imul16, which writes 32 bits to dest.
+.macro zoom_factor dest, src, aspect
+    ; output: dest: fixed6.26
+    ; input: src: fixed3.13
+    ; aspect: fixed3.13
+    ; clobbers A, X, flags, etc
+    copy16 dest, src
+    scale_zoom dest
+
+    ; cy = cy * (3 / 4)
+    ; cx = cx * (5 / 4)
+    imul16 dest, dest, aspect
+.endmacro
+
+; Plot the result for the current point into the 2bpp framebuffer.
+; input: sx, sy = signed screen coordinates relative to the centre,
+;        iter = iteration count (index into color_map),
+;        fill_level = block size selector (index into pixel_masks/fill_masks)
+; The same byte column is written on fill_masks[fill_level] + 1 consecutive
+; rows. Writes pixel_color, pixel_mask, pixel_shift, pixel_offset,
+; pixel_ptr and temp; clobbers a, x, y.
+.proc pset
+    ; colour bits for this iteration count, limited to the pixels that the
+    ; current fill level covers; pixel_mask keeps the other pixels' bits
+    ldx iter
+    lda color_map,x
+    ldx fill_level
+    and pixel_masks,x
+    sta pixel_color
+    lda pixel_masks,x
+    eor #$ff
+    sta pixel_mask
+
+    ; pick the base address that row sy = 0 of the relevant half maps to
+    lda sy
+    bmi upper_half
+
+    ; sy >= 0: rows count down from the start of the bottom buffer
+    lda #.lobyte(framebuffer_bottom)
+    sta pixel_ptr
+    lda #.hibyte(framebuffer_bottom)
+    sta pixel_ptr + 1
+    jmp have_base
+
+upper_half:
+    ; sy < 0: rows count back from just past the end of the top buffer
+    lda #.lobyte(framebuffer_top + stride * half_height)
+    sta pixel_ptr
+    lda #.hibyte(framebuffer_top + stride * half_height)
+    sta pixel_ptr + 1
+
+have_base:
+
+    ; pixel_ptr += sy * stride, with stride = 40 = 8 + 32:
+    ; add (sy << 3), then (sy << 5)
+    copy16 temp, sy
+    .repeat 3
+        shl16 temp
+    .endrepeat
+    add16 pixel_ptr, pixel_ptr, temp
+    .repeat 2
+        shl16 temp
+    .endrepeat
+    add16 pixel_ptr, pixel_ptr, temp
+
+    ; pixel_ptr now addresses the start of the 40-byte line.
+    ; temp = column counted from the left edge
+    lda sx
+    clc
+    adc #half_width
+    sta temp
+
+    ; pixel_shift = column & 3 selects the pixel within the byte; move the
+    ; colour right two bits per pixel (zeros in) and the mask likewise
+    ; (ones in)
+    and #3
+    sta pixel_shift
+    tax
+    beq shift_done
+shift_loop:
+    lsr pixel_color
+    lsr pixel_color
+    .repeat 2
+        sec
+        ror pixel_mask
+    .endrepeat
+    dex
+    bne shift_loop
+shift_done:
+
+    ; X = number of rows to write
+    ldy fill_level
+    ldx fill_masks,y
+    inx
+
+    ; pixel_offset = column >> 2 = byte within the line
+    lda temp
+    lsr a
+    lsr a
+    sta pixel_offset
+    tay
+
+draw_pixel:
+    ; read, mask, or, write
+    lda (pixel_ptr),y
+    and pixel_mask
+    ora pixel_color
+    sta (pixel_ptr),y
+
+    dex
+    beq done
+
+    ; step down one line
+    clc
+    lda pixel_ptr
+    adc #stride
+    sta pixel_ptr
+    bcc draw_pixel
+    inc pixel_ptr + 1
+    jmp draw_pixel
+
+done:
+    rts
+.endproc
+
+; Draw a string into the text buffer at (text_col, text_row).
+; in: INBUFF points at the string; it ends at a 0 byte or after a char with bit 7 set
+; in: text_row; in/out: text_col (advanced by the number of chars drawn)
+; clobbers a/x/y/temp
+.proc draw_string
+    drawptr = temp
+    strptr = INBUFF
+
+    ; drawptr = textbuffer + text_col
+    clc
+    lda #.lobyte(textbuffer)
+    adc text_col
+    sta drawptr
+    lda #.hibyte(textbuffer)
+    adc #0
+    sta drawptr + 1
+
+    ; drawptr += 40 * text_row, one 40-byte row per pass
+    ldx text_row
+    beq rows_added
+next_row:
+    clc
+    lda drawptr
+    adc #40
+    sta drawptr
+    bcc same_page
+    inc drawptr + 1
+same_page:
+    dex
+    bne next_row
+rows_added:
+
+    ldy #0
+next_char:
+    lda (strptr),y
+    ; a zero byte ends the string, C style
+    beq finished
+    ; keep the raw char so its high bit can be tested after drawing
+    pha
+    ; the high bit is only an end marker, not part of the character
+    and #$7f
+    tax
+    lda char_map,x
+    sta (drawptr),y
+    iny
+
+    pla
+    ; the Atari ROM routines set the high bit on the _last_ char only
+    bpl next_char
+
+finished:
+    ; advance text_col past what was just drawn
+    tya
+    clc
+    adc text_col
+    sta text_col
+
+    rts
+.endproc
+
+.macro draw_string_const str
+    lda #.lobyte(str)       ; INBUFF = address of str, the pointer draw_string reads
+    sta INBUFF
+    lda #.hibyte(str)
+    sta INBUFF + 1
+    jsr draw_string         ; draws at text_col/text_row and advances text_col
+.endmacro
+
+.proc vblank_handler
+    ; Vertical-blank routine: counts frames, steps the two colour
+    ; cycling timers, refreshes the palette and leaves through XITVBV.
+    inc count_frames
+    ; chroma timer: every chroma_delay frames step chroma_offset,
+    ; wrapping it to 0 at palette_chroma_entries
+    ldx chroma_ticks
+    inx
+    cpx #(chroma_delay)
+    bne store_chroma_ticks
+    ldx chroma_offset
+    inx
+    cpx #(palette_chroma_entries)
+    bne store_chroma_offset
+    ldx #0
+store_chroma_offset:
+    stx chroma_offset
+    ldx #0
+store_chroma_ticks:
+    stx chroma_ticks
+
+    ; luma timer: every palette_delay frames step palette_offset,
+    ; wrapping it to 0 at palette_entries
+    ldx palette_ticks
+    inx
+    cpx #(palette_delay)
+    bne store_palette_ticks
+    ldx palette_offset
+    inx
+    cpx #(palette_entries)
+    bne store_palette_offset
+    ldx #0
+store_palette_offset:
+    stx palette_offset
+    ldx #0
+store_palette_ticks:
+    stx palette_ticks
+    jsr update_palette
+    jmp XITVBV
+.endproc
+
+.proc update_palette
+    lda #0                      ; COLOR4 is always written as 0
+    sta COLOR4
+    ; COLOR2, COLOR1, COLOR0 = palette_chroma[chroma_offset] | palette_start[palette_offset + 0, 1, 2]
+    ldx chroma_offset
+    ldy palette_offset
+    lda palette_chroma,x
+    ora palette_start,y
+    sta COLOR2
+
+    ; X stays put: all three registers share one chroma entry
+    iny
+    lda palette_chroma,x
+    ora palette_start,y
+    sta COLOR1
+
+    ; as above, only the palette_start index advances
+    iny
+    lda palette_chroma,x
+    ora palette_start,y
+    sta COLOR0
+    ; clobbers A, X, Y
+    rts
+.endproc
+
+.proc update_speed
+    ; @todo unimplemented stub. Planned steps:
+    ;   frames_total += (float)frames (u16)
+    ;   pixels_total += (float)pixels (u16)
+    ;   (frames_total * 16.66666667) / pixels_total
+    ;   convert to ATASCII
+    ;   draw text
+    rts ; explicit return: without it a jsr here would run on into keycheck
+.endproc
+
+.proc keycheck
+    ; Poll the keyboard and apply one pending key press, if any.
+    ; returns 255 in A if state change or 0 if no change (Z flag set to match); clobbers all
+    ; keys: +/- zoom, arrows pan, 1-6 load a preset viewport, X/Y coordinate entry
+    ; check keyboard buffer
+    lda CH
+    cmp #$ff
+    beq skip_char
+
+    ; Mark the key as taken: $ff in CH is the "nothing pending" value tested above
+    ldx #$ff
+    stx CH
+
+    tay                         ; keep the key code in Y; A and X are reused below
+
+    ; zoom keys first: they do not need the pan step computed below
+    cpy #KEY_PLUS
+    beq plus
+    cpy #KEY_MINUS
+    beq minus
+
+    ; temp (32 bits) = $00010000 << (8 - zoom): the pan step for the arrow keys
+    lda #$00
+    sta temp
+    sta temp + 1
+    lda #$01
+    sta temp + 2
+    lda #$00
+    sta temp + 3
+    scale_zoom temp + 2
+
+    cpy #KEY_UP
+    beq up
+    cpy #KEY_DOWN
+    beq down
+    cpy #KEY_LEFT
+    beq left
+    cpy #KEY_RIGHT
+    beq right
+    jmp number_keys
+
+skip_char:
+    lda #0
+    rts
+
+plus:
+    lda zoom
+    cmp #7
+    bcs skip_char               ; unsigned zoom >= 7: already at the limit
+    inc zoom
+    jmp done
+minus:
+    lda zoom
+    cmp #1
+    bcc skip_char               ; unsigned zoom < 1: already at the limit
+    dec zoom
+    jmp done
+up:
+    add32 oy, oy, temp
+    jsr display_coords
+    jmp done
+down:
+    sub32 oy, oy, temp
+    jsr display_coords
+    jmp done
+left:
+    sub32 ox, ox, temp
+    jsr display_coords
+    jmp done
+right:
+    add32 ox, ox, temp
+    jsr display_coords
+    jmp done
+
+number_keys:
+    cpy #KEY_1
+    beq one
+    cpy #KEY_2
+    beq two
+    cpy #KEY_3
+    beq three
+    cpy #KEY_4
+    beq four
+    cpy #KEY_5
+    beq five
+    cpy #KEY_6
+    beq six
+    jmp letter_keys
+
+one:
+    ldx #0
+    jmp load_key_viewport
+two:
+    ldx #1
+    jmp load_key_viewport
+three:
+    ldx #2
+    jmp load_key_viewport
+four:
+    ldx #3
+    jmp load_key_viewport
+five:
+    ldx #4
+    jmp load_key_viewport
+six:
+    ldx #5
+    jmp load_key_viewport
+
+letter_keys:
+    cpy #KEY_X
+    bne not_x
+    jsr input_x
+    jmp done
+not_x:
+    cpy #KEY_Y
+    bne not_y
+    jsr input_y
+    jmp done
+not_y:
+    jmp skip_char               ; any other key: consumed, but reported as no change
+
+load_key_viewport:
+    jsr load_viewport           ; X = preset index chosen above
+    ; fall through
+done:
+    lda #255
+    rts
+
+.endproc
+
+.proc input_x
+    ; Entry of a new x coordinate; so far it only forwards to input_number.
+    ; Passes col_x in X and 1 in Y (presumably text column and row -- confirm).
+    ldx #col_x
+    ldy #1
+    ; tail call: input_number's rts returns directly to our caller
+    jmp input_number
+.endproc
+
+.proc input_y
+    rts ; stub: returns at once; presumably meant to mirror input_x for y -- TODO confirm
+.endproc
+
+.proc input_number
+    rts ; stub: returns at once; input_x calls it with X = col_x and Y = 1
+.endproc
+
+.proc clear_screen
+    ; Zero memory in 256-byte steps from framebuffer_top until the high
+    ; byte of the pointer equals that of display_list. Clobbers A, Y, temp.
+    lda #.hibyte(framebuffer_top)
+    sta temp + 1
+    lda #.lobyte(framebuffer_top)
+    sta temp
+    ; Y is back at 0 whenever the inner loop ends, so it is set once
+    ; and also supplies the zero fill byte at the start of each step
+    ldy #0
+next_page:
+    tya
+next_byte:
+    sta (temp),y
+    iny
+    bne next_byte
+    inc temp + 1
+    lda temp + 1
+    cmp #.hibyte(display_list)
+    bne next_page
+    rts
+.endproc
+
+.proc status_bar
+    ; Draw text row 0: str_self from column 0, then str_run placed so
+    ; that it ends at column 40.
+    lda #0
+    sta text_row
+    sta text_col
+    draw_string_const str_self
+
+    ; drawing moved text_col, so position it again for str_run
+    lda #(40 - str_run_len)
+    sta text_col
+    draw_string_const str_run
+
+    rts
+.endproc
+
+.proc display_coords
+    lda #1                      ; everything below is drawn on text row 1
+    sta text_row
+    lda #col_x
+    sta text_col
+    draw_string_const str_x
+    ; print ox: FR0 = ox -> float -> text, then draw_string prints from INBUFF
+    copy32 FR0, ox
+    jsr fixed6_26_to_float      ; defined elsewhere; presumably converts FR0 in place -- TODO confirm
+    jsr FASC
+    jsr draw_string
+
+    lda #col_y
+    sta text_col
+    draw_string_const str_y
+    ; print oy the same way
+    copy32 FR0, oy
+    jsr fixed6_26_to_float
+    jsr FASC
+    jsr draw_string
+
+    lda #col_zoom
+    sta text_col
+    draw_string_const str_zoom
+    ; print zoom: widen the byte to a 16-bit integer in FR0, then IFP and FASC
+    lda zoom
+    clc
+    adc #0                      ; adds nothing as written; the displayed value is zoom itself
+    sta FR0
+    lda #0
+    sta FR0 + 1
+    jsr IFP
+    jsr FASC
+    jsr draw_string
+
+    rts
+
+.endproc
+
+; input: viewport selector in x
+; clobbers: a, x
+.proc load_viewport
+    ; Copies the selected preset into zoom, ox and oy.
+    lda viewport_zoom,x         ; viewport_zoom is indexed by the selector itself
+    sta zoom
+
+    txa                         ; x = selector * 4: ox/oy entries are 4 bytes each
+    asl a
+    asl a
+
+    tax
+    lda viewport_ox,x           ; byte 0 of both coordinates
+    sta ox
+    lda viewport_oy,x
+    sta oy
+
+    inx                         ; byte 1
+    lda viewport_ox,x
+    sta ox + 1
+    lda viewport_oy,x
+    sta oy + 1
+
+    inx                         ; byte 2
+    lda viewport_ox,x
+    sta ox + 2
+    lda viewport_oy,x
+    sta oy + 2
+
+    inx                         ; byte 3
+    lda viewport_ox,x
+    sta ox + 3
+    lda viewport_oy,x
+    sta oy + 3
+
+    rts
+.endproc
+
+.proc _mandel_start
+    ; Program entry point: sets up display, palette and vblank handler, then renders forever (no rts).
+    jsr imul8xe_init
+
+    ; initialize viewport
+    ldx #0 ; overview
+    jsr load_viewport
+
+    ; Disable display DMA
+    lda #0
+    sta DMACTL
+
+    jsr clear_screen
+    jsr display_coords
+
+    ; Copy the display list into properly aligned memory
+    ; Can't cross 1024-byte boundaries :D
+    ldx #0
+copy_byte_loop:
+    lda display_list_start,x
+    sta display_list,x
+    inx
+    cpx #display_list_len
+    bne copy_byte_loop
+
+    ; Set up the display list
+    lda #.lobyte(display_list)
+    sta DLISTL ; actual register
+    sta SDLSTL ; shadow register the OS will copy in
+    lda #.hibyte(display_list)
+    sta DLISTH ; actual register
+    sta SDLSTH ; shadow register the OS will copy in
+
+    ; Re-enable display DMA
+    lda #$22
+    sta DMACTL
+
+    ; Initialize the palette: offsets and tick counters start at 0
+    lda #0
+    sta palette_offset
+    sta palette_ticks ; the counter; palette_delay is the constant it is compared with
+    sta chroma_offset
+    sta chroma_ticks ; the counter; chroma_delay is the constant it is compared with
+    jsr update_palette
+
+    ; install the vblank handler
+    lda #7 ; deferred
+    ldx #.hibyte(vblank_handler)
+    ldy #.lobyte(vblank_handler)
+    jsr SETVBV
+
+main_loop:
+    ; count_frames = 0; count_iters = 0
+    lda #0
+    sta count_frames
+    sta count_iters
+    sta count_iters + 1
+
+    ; total_sec = 0.0; total_iters = 0.0
+    jsr ZFR0
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
+    jsr FST0R
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
+    jsr FST0R
+
+    jsr clear_screen
+    jsr status_bar
+    jsr display_coords
+
+    lda #0
+    sta fill_level
+
+fill_loop:
+
+    ; sy = -92 .. 91
+    lda #(256-half_height)
+    sta sy
+    lda #(256-1)
+    sta sy + 1
+
+loop_sy:
+    ; sx = -80 .. 79
+    lda #(256-half_width)
+    sta sx
+    lda #(256-1)
+    sta sx + 1
+
+loop_sx:
+    ; check the fill mask
+    ldy #0
+
+loop_skip_level:
+    cpy fill_level
+    beq current_level
+
+    lda fill_masks,y
+    and sx
+    bne not_skipped_mask1
+
+    lda fill_masks,y
+    and sy
+    beq skipped_mask ; on the grid of an earlier level: treated as already drawn
+
+not_skipped_mask1:
+    iny
+    jmp loop_skip_level
+
+current_level:
+    lda fill_masks,y
+    and sx
+    bne skipped_mask
+
+    lda fill_masks,y
+    and sy
+    beq not_skipped_mask
+
+skipped_mask:
+    jmp skipped
+
+not_skipped_mask:
+
+    ; run the fractal!
+    zoom_factor cx, sx, aspect_x
+    add32 cx, cx, ox
+    zoom_factor cy, sy, aspect_y
+    neg32 cy
+    add32 cy, cy, oy
+    jsr mandelbrot
+    jsr pset
+
+    jsr keycheck
+    beq no_key
+    ; @fixme clear the pixel stats
+    jmp main_loop
+
+no_key:
+    ; check if we should update the counters
+
+    ; count_frames >= 120? update!
+    lda count_frames
+    cmp #120 ; >= 2 seconds
+    bcs update_status ; unsigned >=: carry is set when count_frames >= 120
+    jmp skipped
+
+update_status:
+    ; FR0 = (float)count_iters & clear count_iters
+    copy16 FR0, count_iters
+    jsr IFP
+    lda #0
+    sta count_iters
+    sta count_iters + 1
+
+    ; FR1 = total_iters
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
+    jsr FLD1R
+
+    ; FR0 += FR1
+    jsr FADD
+
+    ; total_iters = FR0
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
+    jsr FST0R
+
+
+    ; FR0 = (float)count_frames & clear count_frames
+    ; warning: this should really disable interrupts @TODO
+    lda count_frames
+    sta FR0
+    lda #0
+    sta FR0 + 1
+    sta count_frames
+    jsr IFP
+
+    ; FR0 *= sec_per_frame
+    ldx #.lobyte(sec_per_frame)
+    ldy #.hibyte(sec_per_frame)
+    jsr FLD1R
+    jsr FMUL
+
+    ; FR0 += total_sec
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
+    jsr FLD1R
+    jsr FADD
+
+    ; total_sec = FR0
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
+    jsr FST0R
+
+    ; FR0 /= total_iters
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
+    jsr FLD1R
+    jsr FDIV
+
+    ; FR0 *= us_per_sec
+    ldx #.lobyte(us_per_sec)
+    ldy #.hibyte(us_per_sec)
+    jsr FLD1R
+    jsr FMUL
+
+    ; round (down) to integer
+    jsr FPI
+    clc
+    jsr IFP
+
+    lda #speed_start
+    sta text_col
+    lda #0
+    sta text_row
+    draw_string_const str_speed
+
+    lda text_col
+    pha
+    draw_string_const str_padding
+    pla
+    sta text_col
+
+    ; convert to ASCII in INBUFF and print
+    jsr FASC
+    jsr draw_string
+
+    ; elapsed time
+    ; FR0 = total_sec
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
+    jsr FLD0R
+    ; FR0 -> integer -> elapsed_work
+    jsr FPI
+    lda FR0
+    sta elapsed_work
+    lda FR0 + 1
+    sta elapsed_work + 1
+
+    draw_string_const str_space
+
+    .macro do_countdown divisor, digits
+        ldx #.lobyte(divisor)
+        ldy #.hibyte(divisor)
+        lda #.lobyte(digits)
+        sta INBUFF
+        lda #.hibyte(digits)
+        sta INBUFF + 1
+        jsr countdown
+    .endmacro
+    do_countdown 36000, digits_space
+    do_countdown 3600, digits_zero
+    draw_string_const str_h
+    do_countdown 600, digits_zero
+    do_countdown 60, digits_zero
+    draw_string_const str_m
+    do_countdown 10, digits_zero
+    do_countdown 1, digits_zero
+    draw_string_const str_s
+
+skipped:
+
+    ; sx += fill_masks[fill_level] + 1
+    ldx fill_level
+    lda fill_masks,x
+    clc
+    adc #1 ; will never carry
+    adc sx
+    sta sx
+    lda #0
+    adc sx + 1
+    sta sx + 1
+
+    lda sx
+    cmp #half_width ; exact match on the low byte ends the row
+    beq loop_sx_done
+    jmp loop_sx
+
+loop_sx_done:
+
+    ; sy += fill_masks[fill_level] + 1
+    ldx fill_level
+    lda fill_masks,x
+    clc
+    adc #1 ; will never carry
+    adc sy
+    sta sy
+    lda #0
+    adc sy + 1
+    sta sy + 1
+
+    lda sy
+    cmp #half_height ; exact match on the low byte ends the pass
+    beq loop_sy_done
+    jmp loop_sy
+
+loop_sy_done:
+
+fill_loop_done:
+    inc fill_level
+    lda fill_level
+    cmp #max_fill_level
+    beq loop
+    jmp fill_loop
+
+loop:
+    ; finished
+
+    lda #(40 - str_done_len)
+    sta text_col
+    lda #0
+    sta text_row
+    draw_string_const str_done
+
+    jsr keycheck
+    beq loop
+    jmp main_loop
+
+.endproc
+
+; Draw one digit of the elapsed time: the number of times the divisor
+; fits into elapsed_work, which is left holding the remainder.
+; in: divisor in X (low) / Y (high); INBUFF -> digit string; clobbers temp
+.proc countdown
+    divisor = temp
+    stx divisor
+    sty divisor + 1
+
+    ; Y = quotient, found by repeated subtraction
+    ldy #0
+compare:
+    ; 16-bit unsigned compare, high bytes first
+    lda elapsed_work + 1
+    cmp divisor + 1
+    bcc emit_digit      ; high byte smaller: the divisor no longer fits
+    bne subtract        ; high byte larger: it certainly fits
+    lda elapsed_work    ; high bytes equal: the low bytes decide
+    cmp divisor
+    bcc emit_digit
+subtract:
+    sec
+    lda elapsed_work
+    sbc divisor
+    sta elapsed_work
+    lda elapsed_work + 1
+    sbc divisor + 1
+    sta elapsed_work + 1
+    iny
+    jmp compare
+emit_digit:
+    ; copy character number Y of the caller's string to elapsed_digit with
+    ; bit 7 flipped; draw_string stops after a char that has bit 7 set
+    lda (INBUFF),y
+    eor #$80
+    sta elapsed_digit
+    lda #.lobyte(elapsed_digit)
+    sta INBUFF
+    lda #.hibyte(elapsed_digit)
+    sta INBUFF + 1
+    jmp draw_string     ; tail call: its rts returns to our caller
+.endproc
+
+.proc imul8xe_init
+    ; Probe for bank-switched RAM; if found, redirect the multiply/square routines to the xe versions and build their table.
+    bank_switch 0               ; probe: store 0 with bank 0 selected...
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1               ; ...then 1 at the same address with bank 1 selected
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0               ; bank 0 still reads 0 only if the two stores hit different memory
+    lda EXTENDED_RAM
+    beq init
+
+    ; no bank switching available, we just overwrite the value in base ram
+    rts
+
+init:
+
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1    ; also patch the operand inside mandelbrot
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2
+
+    ; ditto for sqr16_func -> sqr16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2
+
+
+    ; create the lookup table
+    ; go through the input set, in four 16KB chunks
+    ; NOTE(review): imul8xe_init_section is defined elsewhere; presumably it fills one bank and advances arg1 -- confirm
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+
+    lda #$00                    ; arg1 = arg2 = 0 and ptr = $4000
+    sta arg1
+    sta arg2
+    sta ptr
+    lda #$40
+    sta ptr + 1
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+    ; bank 3 is still the selected bank on return
+    rts
+.endproc
diff --git a/mandel.c b/mandel.c
new file mode 100644
index 0000000..f287fa3
--- /dev/null
+++ b/mandel.c
@@ -0,0 +1,15 @@
+/**
+ * The UI and I/O wrapper for the Mandelbrot runner, in C.
+ *
+ * For the moment *all* logic is in mandel-core.s, I'm just
+ * trying to get this to run within a cc65 environment.
+ * Eventually just the inner loop fun will live in there.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "mandel.h"
+
+void main(void) {
+    mandel_start(); /* hands control to the assembly code; its main loop never returns */
+}
\ No newline at end of file
diff --git a/mandel.h b/mandel.h
new file mode 100644
index 0000000..e43fad7
--- /dev/null
+++ b/mandel.h
@@ -0,0 +1,4 @@
+#include <inttypes.h>
+
+// From mandel-core.s (label _mandel_start): sets up the display and runs the render loop; does not return.
+extern void mandel_start(void);
diff --git a/mandel.s b/mandel.s
deleted file mode 100644
index 3db6a77..0000000
--- a/mandel.s
+++ /dev/null
@@ -1,1213 +0,0 @@
-; Our zero-page vars
-sx    = $80     ; i16: screen pixel x
-sy    = $82     ; i16: screen pixel y
-ox    = $84     ; fixed4.12: center point x
-oy    = $86     ; fixed4.12: center point y
-cx    = $88     ; fixed4.12: c_x
-cy    = $8a     ; fixed4.12: c_y
-zx    = $8c     ; fixed4.12: z_x
-zy    = $8e     ; fixed4.12: z_y
-
-zx_2  = $90     ; fixed4.12: z_x^2
-zy_2  = $92     ; fixed4.12: z_y^2
-zx_zy = $94     ; fixed4.12: z_x * z_y
-dist  = $96     ; fixed4.12: z_x^2 + z_y^2
-
-iter         = $a0 ; u8: iteration count
-
-zoom         = $a1 ; u8: zoom shift level
-count_frames = $a2 ; u8
-count_pixels = $a3 ; u8
-total_ms     = $a4 ; float48
-total_pixels = $aa ; float48
-
-z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
-z_buffer_start  = $b1 ; u8: index into z_buffer
-z_buffer_end    = $b2 ; u8: index into z_buffer
-temp            = $b4 ; u16
-
-pixel_ptr       = $b6 ; u16
-pixel_color     = $b8 ; u8
-pixel_mask      = $b9 ; u8
-pixel_shift     = $ba ; u8
-pixel_offset    = $bb ; u8
-fill_level      = $bc ; u8
-palette_offset  = $bd ; u8
-
-; FP registers in zero page
-FR0    = $d4 ; float48
-FRE    = $da
-FR1    = $e0 ; float48
-FR2    = $e6 ; float48
-CIX    = $f2 ; u8 - index into INBUFF
-INBUFF = $f3 ; u16 - pointer to ascii
-FLPTR  = $fc ; u16 - pointer to user buffer float48
-
-CH1    = $02f2 ; previous character read from keyboard
-CH     = $02fc ; current character read from keyboard
-
-LBUFF  = $0580 ; result buffer for FASC routine
-
-; FP ROM routine vectors
-FASC   = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
-IFP    = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
-FADD   = $DA66 ; ADDITION       (FR0 += FR1)
-FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
-FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
-FDIV   = $DB28 ; DIVISION       (FR0 /= FR1)
-ZF1    = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
-FLD0R  = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
-FLD1R  = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
-FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
-FMOVE  = $DDB6 ; MOVE FR0 TO FR1
-
-; High data
-framebuffer_top    = $8000
-textbuffer         = $8f00
-framebuffer_bottom = $9000
-display_list       = $9f00
-framebuffer_end    = $a000
-
-height = 184
-half_height = height >> 1
-width = 160
-half_width = width >> 1
-stride = width >> 2
-
-DMACTL = $D400
-DLISTL = $D402
-DLISTH = $D403
-WSYNC  = $D40A
-
-; OS shadow registers
-SDLSTL = $230
-SDLSTH = $231
-
-; interrupt stuff
-SYSVBV = $E45F
-XITVBV = $E462
-SETVBV = $E45C
-
-COLOR0 = $2C4
-COLOR1 = $2C5
-COLOR2 = $2C6
-COLOR3 = $2C7
-COLOR4 = $2C8
-
-; Keycodes!
-KEY_PLUS  = $06
-KEY_MINUS = $0e
-KEY_UP    = $8e
-KEY_DOWN  = $8f
-KEY_LEFT  = $86
-KEY_RIGHT = $87
-
-.struct float48
-    exponent .byte
-    mantissa .byte 6
-.endstruct
-
-.data
-
-strings:
-str_self:
-    .byte "MANDEL-6502"
-str_self_end:
-str_speed:
-    .byte " ms/px"
-str_speed_end:
-str_run:
-    .byte " RUN"
-str_run_end:
-str_done:
-    .byte "DONE"
-str_done_end:
-
-str_self_len = str_self_end - str_self
-str_speed_len = str_speed_end - str_speed
-str_run_len = str_run_end - str_run
-str_done_len = str_done_end - str_done
-speed_precision = 6
-
-speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
-speed_len = 14 + str_speed_len
-
-
-char_map:
-    ; Map ATASCII string values to framebuffer font entries
-    ; Sighhhhh
-    .repeat 32, i
-        .byte i + 64
-    .endrepeat
-    .repeat 64, i
-        .byte i
-    .endrepeat
-    .repeat 32, i
-        .byte 96 + i
-    .endrepeat
-
-hex_chars:
-    .byte "0123456789abcdef"
-
-aspect:
-    ; aspect ratio!
-    ; pixels at 320w are 5:6 (narrow)
-    ; pixels at 160w are 5:3 (wide)
-    ;
-    ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
-    ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
-    ;
-    ; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
-    ; &horizontal range -80 .. 79.9 is -3.125 .. 3.124
-    ;
-    ; 184h is the equiv of 220.8h at square pixels
-    ; 320 / 220.8 = 1.45 display aspect ratio
-aspect_x: ; fixed4.16 5/4
-    .word 5 << (12 - 2)
-
-aspect_y: ; fixed4.16 3/4
-    .word 3 << (12 - 2)
-
-ms_per_frame: ; float48 16.66666667
-    .byte 64  ; exponent/sign
-    .byte $16 ; BCD digits
-    .byte $66
-    .byte $66
-    .byte $66
-    .byte $67
-
-display_list_start:
-    ; 24 lines overscan
-    .repeat 3
-        .byte $70 ; 8 blank lines
-    .endrep
-
-    ; 8 scan lines, 1 row of 40-column text
-    .byte $42
-    .addr textbuffer
-
-    ; 184 lines graphics
-    ; ANTIC mode e (160px 2bpp, 1 scan line per line)
-    .byte $4e
-    .addr framebuffer_top
-    .repeat half_height - 1
-        .byte $0e
-    .endrep
-    .byte $4e
-    .addr framebuffer_bottom
-    .repeat half_height - 1
-        .byte $0e
-    .endrep
-
-    .byte $41 ; jump and blank
-    .addr display_list
-display_list_end:
-display_list_len = display_list_end - display_list_start
-
-color_map:
-    .byte 0
-    .repeat 85
-        .byte 1
-        .byte 2
-        .byte 3
-    .endrepeat
-
-palette:
-    .byte $00
-    .byte $46
-    .byte $78
-    .byte $b4
-.code
-
-z_buffer_len = 16
-z_buffer_mask = z_buffer_len - 1
-z_buffer:
-    ; the last N zx/zy values
-    .repeat z_buffer_len
-        .word 0
-        .word 0
-    .endrepeat
-
-.export start
-
-max_fill_level = 6
-fill_masks:
-    .byte %00011111
-    .byte %00001111
-    .byte %00000111
-    .byte %00000011
-    .byte %00000001
-    .byte %00000000
-
-; 2 + 9 * byte cycles
-.macro add bytes, dest, arg1, arg2
-    clc ; 2 cyc
-    .repeat bytes, byte ; 9 * byte cycles
-        lda arg1 + byte
-        adc arg2 + byte
-        sta dest + byte
-    .endrepeat
-.endmacro
-
-.macro add16 dest, arg1, arg2
-    add 2, dest, arg1, arg2
-.endmacro
-
-.macro add32 dest, arg1, arg2
-    add 4, dest, arg2, dest
-.endmacro
-
-; 2 + 9 * byte cycles
-.macro sub bytes, dest, arg1, arg2
-    sec ; 2 cyc
-    .repeat bytes, byte ; 9 * byte cycles
-        lda arg1 + byte
-        sbc arg2 + byte
-        sta dest + byte
-    .endrepeat
-.endmacro
-
-.macro sub16 dest, arg1, arg2
-    sub 2, dest, arg1, arg2
-.endmacro
-
-.macro sub32 dest, arg1, arg2
-    sub 4, dest, arg1, arg2
-.endmacro
-
-.macro shl bytes, arg
-    asl arg
-    .repeat bytes-1, i
-        rol arg + 1 + i
-    .endrepeat
-.endmacro
-
-.macro shl16 arg
-    shl 2, arg
-.endmacro
-
-.macro shl24 arg
-    shl 3, arg
-.endmacro
-
-.macro shl32 arg
-    shl 4, arg
-.endmacro
-
-; 6 * bytes cycles
-.macro copy bytes, dest, arg
-    .repeat bytes, byte ; 6 * bytes cycles
-        lda arg + byte  ; 3 cyc
-        sta dest + byte ; 3 cyc
-    .endrepeat
-.endmacro
-
-.macro copy16 dest, arg
-    copy 2, dest, arg
-.endmacro
-
-.macro copy32 dest, arg
-    copy 4, dest, arg
-.endmacro
-
-.macro copyfloat dest, arg
-    copy 6, dest, arg
-.endmacro
-
-; 2 + 8 * byte cycles
-.macro neg bytes, arg
-    sec ; 2 cyc
-    .repeat bytes, byte ; 8 * byte cycles
-        lda #00         ; 2 cyc
-        sbc arg + byte  ; 3 cyc
-        sta arg + byte  ; 3 cyc
-    .endrepeat
-.endmacro
-
-; 18 cycles
-.macro neg16 arg
-    neg 2, arg
-.endmacro
-
-; 34 cycles
-.macro neg32 arg
-    neg 4, arg
-.endmacro
-
-; inner loop for imul16
-; bitnum < 8: 25 or 41 cycles
-; bitnum >= 8: 30 or 46 cycles
-.macro bitmul16 arg1, arg2, result, bitnum
-    .local zero
-    .local one
-    .local next
-
-    ; does 16-bit adds
-    ; arg1 and arg2 are treated as unsigned
-    ; negative signed inputs must be flipped first
-
-    ; 7 cycles up to the branch
-
-    ; check if arg1 has 0 or 1 bit in this place
-    ; 5 cycles either way
-    .if bitnum < 8
-        lda arg1                 ; 3 cyc
-        and #(1 << (bitnum))       ; 2 cyc
-    .else
-        lda arg1 + 1             ; 3 cyc
-        and #(1 << ((bitnum) - 8)) ; 2 cyc
-    .endif
-    bne one ; 2 cyc
-
-zero: ; 18 cyc, 23 cyc
-    lsr result + 3 ; 5 cyc
-    jmp next       ; 3 cyc
-
-one: ; 32 cyc, 37 cyc
-    ; 16-bit add on the top bits
-    clc            ; 2 cyc
-    lda result + 2 ; 3 cyc
-    adc arg2       ; 3 cyc
-    sta result + 2 ; 3 cyc
-    lda result + 3 ; 3 cyc
-    adc arg2 + 1   ; 3 cyc
-    ror a          ; 2 cyc - get a jump on the shift
-    sta result + 3 ; 3 cyc
-next:
-    ror result + 2 ; 5 cyc
-    ror result + 1 ; 5 cyc
-    .if bitnum >= 8
-        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
-        ; when it's all uninitialized data
-        ror result ; 5 cyc
-    .endif
-
-.endmacro
-
-; 5 to 25 cycles
-.macro check_sign arg
-    ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
-    .local positive
-    lda arg + 1   ; 3 cyc
-    bpl positive  ; 2 cyc
-    neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
-positive:
-.endmacro
-
-; 518 - 828 cyc
-.macro imul16 dest, arg1, arg2
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
-    copy32 dest, FR2  ; 24 cyc
-.endmacro
-
-.macro shift_round_16 arg, shift
-    .repeat shift
-        shl32 arg
-    .endrepeat
-    round16 arg
-.endmacro
-
-.macro imul16_round dest, arg1, arg2, shift
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
-    shift_round_16 FR2, shift
-    copy16 dest, FR2 + 2  ; 12 cyc
-.endmacro
-
-; min 470 cycles
-; max 780 cycles
-.proc imul16_func
-    arg1 = FR0   ; 16-bit arg (clobbered)
-    arg2 = FR1   ; 16-bit arg (clobbered)
-    result = FR2 ; 32-bit result
-
-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
-    check_sign arg1 ; 5 to 25 cyc
-    check_sign arg2 ; 5 to 25 cyc
-    
-    ; zero out the 32-bit temp's top 16 bits
-    lda #0          ; 2 cyc
-    sta result + 2  ; 3 cyc
-    sta result + 3  ; 3 cyc
-    ; the bottom two bytes will get cleared by the shifts
-
-    ; unrolled loop for maximum speed, at the cost
-    ; of a larger routine
-    ; 440 to 696 cycles
-    .repeat 16, bitnum
-        ; bitnum < 8: 25 or 41 cycles
-        ; bitnum >= 8: 30 or 46 cycles
-        bitmul16 arg1, arg2, result, bitnum
-    .endrepeat
-
-    ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
-    bne positive_result ; 2 cyc
-    neg32 result        ; 34 cyc
-positive_result:
-
-    rts ; 6 cyc
-.endproc
-
-.macro round16 arg
-    ; Round top 16 bits of 32-bit fixed-point number in-place
-    .local increment
-    .local high_half
-    .local check_sign
-    .local next
-
-    ; low word > $8000: round up
-    ;          = $8000: round up   if positive
-    ;                   round down if negative
-    ;          < $8000: round down
-
-    lda arg + 1
-    cmp #$80
-    beq high_half
-    bpl increment
-    bmi next
-
-high_half:
-    lda arg
-    beq check_sign
-    bpl increment
-    bmi next
-
-check_sign:
-    lda arg + 3
-    bmi next
-
-increment:       ; 5-10 cyc
-    inc arg + 2  ; 5 cyc
-    bne next     ; 2 cyc
-    inc arg + 3  ; 5 cyc
-
-next:
-
-.endmacro
-
-.proc mandelbrot
-    ; input:
-    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; cy: position scaled to 4.12
-    ;
-    ; output:
-    ; iter: iteration count at escape or 0
-
-    ; zx = 0
-    ; zy = 0
-    ; zx_2 = 0
-    ; zy_2 = 0
-    ; zx_zy = 0
-    ; dist = 0
-    ; iter = 0
-    lda #00
-    ldx #(iter - zx + 1)
-initloop:
-    sta zx - 1,x
-    dex
-    bne initloop
-    sta z_buffer_start
-    sta z_buffer_end
-
-loop:
-    ; iter++ & max-iters break
-    inc iter
-    bne keep_going
-    jmp exit_path
-keep_going:
-
-    .macro quick_exit arg, max
-        .local positive
-        .local negative
-        .local nope_out
-        .local first_equal
-        .local all_done
-
-        ; check sign bit
-        lda arg + 1
-        bmi negative
-
-    positive:
-        cmp #((max) << 4)
-        bmi all_done ; 'less than'
-        jmp exit_path
-
-    negative:
-        cmp #(256 - ((max) << 4))
-        beq first_equal ; 'equal' on first byte
-        bpl all_done    ; 'greater than'
-
-    nope_out:
-        jmp exit_path
-    
-    first_equal:
-        lda arg
-        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
-
-    all_done:
-    .endmacro
-
-    ; 4.12: (-8 .. +7.9)
-    ; zx = zx_2  - zy_2  + cx
-    sub16 zx, zx_2, zy_2
-    add16 zx, zx, cx
-    quick_exit zx, 2
-
-    ; zy = zx_zy + zx_zy + cy
-    add16 zy, zx_zy, zx_zy
-    add16 zy, zy, cy
-    quick_exit zy, 2
-
-    ; zx_2 = zx * zx
-    imul16_round zx_2, zx, zx, 4
-
-    ; zy_2 = zy * zy
-    imul16_round zy_2, zy, zy, 4
-
-    ; zx_zy = zx * zy
-    imul16_round zx_zy, zx, zy, 4
-
-    ; dist = zx_2 + zy_2
-    add16 dist, zx_2, zy_2
-    quick_exit dist, 4
-
-    ; if may be in the lake, look for looping output with a small buffer
-    ; as an optimization vs running to max iters
-    lda z_buffer_active
-    beq skip_z_buffer
-
-    ldx z_buffer_start
-    cpx z_buffer_end
-    beq z_nothing_to_read
-
-z_buffer_loop:
-    .macro z_compare arg
-        .local compare_no_match
-        lda z_buffer,x
-        inx
-        cmp arg
-        bne compare_no_match
-        iny
-    compare_no_match:
-    .endmacro
-    .macro z_advance
-        .local skip_reset_x
-        cpx #(z_buffer_len * 4)
-        bmi skip_reset_x
-        ldx #0
-    skip_reset_x:
-    .endmacro
-    .macro z_store arg
-        lda arg
-        sta z_buffer,x
-        inx
-    .endmacro
-
-    ; Compare the previously stored z values
-    ldy #0
-    z_compare zx
-    z_compare zx + 1
-    z_compare zy
-    z_compare zy + 1
-
-    cpy #4
-    bne z_no_matches
-    jmp z_exit
-
-z_no_matches:
-    z_advance
-
-    cpx z_buffer_end
-    bne z_buffer_loop
-
-z_nothing_to_read:
-
-    ; Store and expand
-    z_store zx
-    z_store zx + 1
-    z_store zy
-    z_store zy + 1
-    z_advance
-    stx z_buffer_end
-
-    ; Increment the start roller if necessary (limit size)
-    lda iter
-    cmp #(z_buffer_len * 4)
-    bmi skip_inc_start
-    lda z_buffer_start
-    clc
-    adc #4
-    tax
-    z_advance
-    stx z_buffer_start
-skip_inc_start:
-
-skip_z_buffer:
-
-    jmp loop
-
-z_exit:
-    lda #0
-    sta iter
-
-exit_path:
-    ldx #0
-    lda iter
-    bne next
-    inx
-next:
-    stx z_buffer_active
-    rts
-
-.endproc
-
-.macro scale_zoom dest
-    ; clobbers X, flags
-    .local cont
-    .local enough
-
-    ; cx = (sx << (8 - zoom))
-    ldx zoom
-cont:
-    cpx #8
-    beq enough
-    shl16 dest
-    inx
-    jmp cont
-enough:
-.endmacro
-
-.macro zoom_factor dest, src, zoom, aspect
-    ; clobbers A, X, flags, etc
-    copy16 dest, src
-    scale_zoom dest
-
-    ; cy = cy * (3 / 4)
-    ; cx = cx * (5 / 4)
-    imul16_round dest, dest, aspect, 4
-.endmacro
-
-.proc pset
-    ; screen coords in signed sx,sy
-    ; iter holds the target to use
-    ; @todo implement
-
-    ; iter -> color
-    ldx iter
-    lda color_map,x
-    sta pixel_color
-    lda #(255 - 3)
-    sta pixel_mask
-
-    ; sy -> line base address in temp
-    lda sy
-    bpl positive
-
-negative:
-    ; temp1 = top half
-    lda #.lobyte(framebuffer_top + stride * half_height)
-    sta pixel_ptr
-    lda #.hibyte(framebuffer_top + stride * half_height)
-    sta pixel_ptr + 1
-    jmp point
-
-positive:
-
-    lda #.lobyte(framebuffer_bottom)
-    sta pixel_ptr
-    lda #.hibyte(framebuffer_bottom)
-    sta pixel_ptr + 1
-
-point:
-
-    ; pixel_ptr += sy * stride
-    ;    temp * 40
-    ; =  temp * 32  +  temp * 8
-    ; = (temp << 5) + (temp << 3)
-    copy16 temp, sy
-    shl16 temp
-    shl16 temp
-    shl16 temp
-    add16 pixel_ptr, pixel_ptr, temp
-    shl16 temp
-    shl16 temp
-    add16 pixel_ptr, pixel_ptr, temp
-
-    ; Ok so temp1 points to the start of the line, which is 40 bytes.
-    ; Get the byte and bit offsets
-    lda sx
-    clc
-    adc #half_width
-    sta temp
-
-    ; pixel_shift = temp & 3
-    ; pixel_color <<= pixel_shift (shifting in zeros)
-    ; pixel_mask <<= pixel_shift (shifting in ones)
-    and #3
-    sta pixel_shift
-    lda #3
-    sec
-    sbc pixel_shift
-    tax
-shift_loop:
-    beq shift_done
-    asl pixel_color
-    asl pixel_color
-    sec
-    rol pixel_mask
-    sec
-    rol pixel_mask
-    dex
-    jmp shift_loop
-shift_done:
-
-    ; pixel_offset = temp >> 2
-    lda temp
-    lsr a
-    lsr a
-    sta pixel_offset
-    tay
-
-    ; read, mask, or, write
-    lda (pixel_ptr),y
-    and pixel_mask
-    ora pixel_color
-    sta (pixel_ptr),y
-
-    rts
-.endproc
-
-.macro draw_text_indirect col, len, strptr
-    ; clobbers A, X
-    .local loop
-    .local done
-    ldx #0
-loop:
-    cpx #len
-    beq done
-    txa
-    tay
-    lda (strptr),y
-    tay
-    lda char_map,y
-    sta textbuffer + col,x
-    inx
-    jmp loop
-done:
-.endmacro
-
-.macro draw_text col, len, cstr
-    ; clobbers A, X
-    .local loop
-    .local done
-    ldx #0
-loop:
-    cpx #len
-    beq done
-    ldy cstr,x
-    lda char_map,y
-    sta textbuffer + col,x
-    inx
-    jmp loop
-done:
-.endmacro
-
-.proc vblank_handler
-    inc count_frames
-    inc palette_offset
-    jsr update_palette
-    jmp XITVBV
-.endproc
-
-.proc update_palette
-    lda palette
-    sta COLOR4
-
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 1
-    sta COLOR0
-
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 2
-    sta COLOR1
-
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 3
-    sta COLOR2
-.endproc
-
-.proc update_speed
-    ; convert frames (u16) to fp
-    ; add to frames_total
-    ; convert pixels (u16) to fp
-    ; add to pixels_total
-    ; (frames_total * 16.66666667) / pixels_total
-    ; convert to ATASCII
-    ; draw text
-.endproc
-
-.proc keycheck
-    ; clobbers all
-    ; returns 255 in A if state change or 0 if no change
-
-    ; check keyboard buffer
-    lda CH
-    cmp #$ff
-    beq skip_char
-
-    ; Clear the keyboard buffer and re-enable interrupts
-    ldx #$ff
-    stx CH
-
-    tay
-
-    lda zoom
-    cpy #KEY_PLUS
-    beq plus
-    cpy #KEY_MINUS
-    beq minus
-
-    ; temp = $0010 << (8 - zoom)
-    lda #$10
-    sta temp
-    lda #$00
-    sta temp + 1
-    scale_zoom temp
-
-    cpy #KEY_UP
-    beq up
-    cpy #KEY_DOWN
-    beq down
-    cpy #KEY_LEFT
-    beq left
-    cpy #KEY_RIGHT
-    beq right
-
-skip_char:
-    lda #0
-    rts
-
-plus:
-    cmp #8
-    bpl skip_char
-    inc zoom
-    jmp done
-minus:
-    cmp #1
-    bmi skip_char
-    dec zoom
-    jmp done
-up:
-    sub16 oy, oy, temp 
-    jmp done
-down:
-    add16 oy, oy, temp
-    jmp done
-left:
-    sub16 ox, ox, temp
-    jmp done
-right:
-    add16 ox, ox, temp
-done:
-    lda #255
-    rts
-
-.endproc
-
-.proc clear_screen
-    ; zero the range from framebuffer_top to display_list
-    lda #.lobyte(framebuffer_top)
-    sta temp
-    lda #.hibyte(framebuffer_top)
-    sta temp + 1
-
-zero_page_loop:
-    lda #0
-    ldy #0
-zero_byte_loop:
-    sta (temp),y
-    iny
-    bne zero_byte_loop
-
-    inc temp + 1
-    lda temp + 1
-    cmp #.hibyte(display_list)
-    bne zero_page_loop
-
-    rts
-.endproc
-
-.proc status_bar
-    ; Status bar
-    draw_text 0, str_self_len, str_self
-    draw_text 40 - str_run_len, str_run_len, str_run
-
-    rts
-.endproc
-
-.proc start
-
-    ; ox = 0; oy = 0; zoom = 0
-    ; count_frames = 0; count_pixels = 0
-    lda #0
-    sta ox
-    sta ox + 1
-    sta oy
-    sta oy + 1
-    sta count_frames
-    sta count_pixels
-
-    ; total_ms = 0.0; total_pixels = 0.0
-    ldx #total_ms
-    jsr ZF1
-    ldx #total_pixels
-    jsr ZF1
-
-    ; zoom = 2x
-    lda #1
-    sta zoom
-
-    ; Disable display DMA
-    lda #0
-    sta DMACTL
-
-    jsr clear_screen
-
-    ; Copy the display list into properly aligned memory
-    ; Can't cross 1024-byte boundaries :D
-    ldx #0
-copy_byte_loop:
-    lda display_list_start,x
-    sta display_list,x
-    inx
-    cpx #display_list_len
-    bne copy_byte_loop
-
-    ; Set up the display list
-    lda #.lobyte(display_list)
-    sta DLISTL ; actual register
-    sta SDLSTL ; shadow register the OS will copy in
-    lda #.hibyte(display_list)
-    sta DLISTH ; actual register
-    sta SDLSTH ; shadow register the OS will copy in
-
-    ; Re-enable display DMA
-    lda #$22
-    sta DMACTL
-
-    ; Initialize the palette
-    lda #0
-    sta palette_offset
-    jsr update_palette
-
-    ; install the vblank handler
-    lda #7 ; deferred
-    ldx #.hibyte(vblank_handler)
-    ldy #.lobyte(vblank_handler)
-    jsr SETVBV
-
-main_loop:
-    jsr clear_screen
-    jsr status_bar
-
-    lda #0
-    sta fill_level
-
-fill_loop:
-
-    ; sy = -92 .. 91
-    lda #(256-half_height)
-    sta sy
-    lda #(256-1)
-    sta sy + 1
-
-loop_sy:
-    ; sx = -80 .. 79
-    lda #(256-half_width)
-    sta sx
-    lda #(256-1)
-    sta sx + 1
-
-loop_sx:
-    ; check the fill mask
-    ldy #0
-
-loop_skip_level:
-    cpy fill_level
-    beq current_level
-
-    lda fill_masks,y
-    and sx
-    bne not_skipped_mask1
-
-    lda fill_masks,y
-    and sy
-    beq skipped_mask
-
-not_skipped_mask1:
-    iny
-    jmp loop_skip_level
-
-current_level:
-    lda fill_masks,y
-    and sx
-    bne skipped_mask
-
-    lda fill_masks,y
-    and sy
-    beq not_skipped_mask
-
-skipped_mask:
-    jmp skipped
-
-not_skipped_mask:
-
-    ; run the fractal!
-    zoom_factor cx, sx, zoom, aspect_x
-    add16 cx, cx, ox
-    zoom_factor cy, sy, zoom, aspect_y
-    add16 cy, cy, oy
-    jsr mandelbrot
-    jsr pset
-
-    jsr keycheck
-    beq no_key
-    ; @fixme clear the pixel stats
-    jmp main_loop
-
-no_key:
-    ; check if we should update the counters
-    ;
-    ; count_pixels >= width? update!
-    inc count_pixels
-    lda count_pixels
-    cmp #width
-    bmi update_status
-
-    ; count_frames >= 120? update!
-    lda count_frames
-    cmp #120 ; >= 2 seconds
-    bmi skipped
-
-update_status:
-    ; FR0 = (float)count_pixels & clear count_pixels
-    lda count_pixels
-    sta FR0
-    lda #0
-    sta FR0 + 1
-    sta count_pixels
-    jsr IFP
-
-    ; FR1 = total_pixels
-    ldx #.lobyte(total_pixels)
-    ldy #.hibyte(total_pixels)
-    jsr FLD1R
-
-    ; FR0 += FR1
-    jsr FADD
-
-    ; total_pixels = FR0
-    ldx #.lobyte(total_pixels)
-    ldy #.hibyte(total_pixels)
-    jsr FST0R
-
-
-    ; FR0 = (float)count_frames & clear count_frames
-    ; warning: this should really disable interrupts @TODO
-    lda count_frames
-    sta FR0
-    lda #0
-    sta FR0 + 1
-    sta count_frames
-    jsr IFP
-
-    ; FR0 *= ms_per_frame
-    ldx #.lobyte(ms_per_frame)
-    ldy #.hibyte(ms_per_frame)
-    jsr FLD1R
-    jsr FMUL
-
-    ; FR0 += total_ms
-    ldx #total_ms
-    ldy #0
-    jsr FLD1R
-    jsr FADD
-
-    ; total_ms = FR0
-    ldx #total_ms
-    ldy #0
-    jsr FST0R
-
-    ; FR0 /= total_pixels
-    ldx #total_pixels
-    ldy #0
-    jsr FLD1R
-    jsr FDIV
-
-    ; convert to ASCII in INBUFF
-    jsr FASC
-
-    ; print the first 6 digits
-    draw_text_indirect speed_start, speed_precision, INBUFF
-    draw_text speed_start + speed_precision, str_speed_len, str_speed
-
-skipped:
-
-    clc
-    lda sx
-    adc #1
-    sta sx
-    lda sx + 1
-    adc #0
-    sta sx + 1
-
-    lda sx
-    cmp #half_width
-    beq loop_sx_done
-    jmp loop_sx
-
-loop_sx_done:
-
-    clc
-    lda sy
-    adc #1
-    sta sy
-    lda sy + 1
-    adc #0
-    sta sy + 1
-
-    lda sy
-    cmp #half_height
-    beq loop_sy_done
-    jmp loop_sy
-
-loop_sy_done:
-
-fill_loop_done:
-    inc fill_level
-    lda fill_level
-    cmp #max_fill_level
-    beq loop
-    jmp fill_loop
-
-loop:
-    ; finished
-    draw_text 40 - str_done_len, str_done_len, str_done
-    jsr keycheck
-    beq loop
-    jmp main_loop
-
-.endproc
diff --git a/readme.md b/readme.md
index 6b57378..2c9efc1 100644
--- a/readme.md
+++ b/readme.md
@@ -14,32 +14,37 @@ Non-goals:
 
 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
 
--- brooke, january 2023 - february 2024
+-- brooke, january 2023 - december 2024
 
 ## Current state
 
-Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
 
-The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
 
-The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.
+* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
+* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
+* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
+* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
 
-The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
+The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
 
 Iterations are capped at 255.
 
 The pixels are run in a progressive layout to get the basic shape on screen faster.
 
-## Next steps
+There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
 
-Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
+There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
 
-Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
-
-I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
+There's some cute color cycling.
 
 ## Deps and build instructions
 
 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
 
 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
+
+## Todo
+
+See ideas in `todo.md`.
diff --git a/tables.js b/tables.js
new file mode 100644
index 0000000..50cbef9
--- /dev/null
+++ b/tables.js
@@ -0,0 +1,50 @@
+function db(func) { // render func(0)..func(255) as ca65 `.byte` lines, 16 values per line
+    let lines = [];
+    for (let i = 0; i < 256; i += 16) { // i = table index of the first value on this output line
+        let items = [];
+        for (let j = 0; j < 16; j++) {
+            let x = i + j; // table index handed to func
+            items.push(func(x));
+        }
+        lines.push('    .byte ' + items.join(', '));
+    }
+    return lines.join('\n'); // single string, lines separated by newlines, no trailing newline
+}
+
+let squares = []; // squares[i] = (i*i + 1) / 2 truncated to an integer, for i = 0..511
+for (let i = 0; i < 512; i++) {
+    squares.push(Math.trunc((i * i + 1) / 2));
+}
+
+console.log( // write the generated assembly source to stdout (the Makefile redirects it into tables.s)
+`.segment "TABLES"
+
+.export mul_lobyte256
+.export mul_hibyte256
+.export mul_hibyte512
+.export sqr_lobyte
+.export sqr_hibyte
+
+; (i * i + 1) / 2 for the multiplier
+.align 256
+mul_lobyte256:
+${db((i) => squares[i] & 0xff)}
+
+.align 256
+mul_hibyte256:
+${db((i) => (squares[i] >> 8) & 0xff)}
+
+.align 256
+mul_hibyte512:
+${db((i) => (squares[i + 256] >> 8) & 0xff)}
+
+; (i * i) for the plain squares
+.align 256
+sqr_lobyte:
+${db((i) => (i * i) & 0xff)}
+
+.align 256
+sqr_hibyte:
+${db((i) => ((i * i) >> 8) & 0xff)}
+
+`);
diff --git a/testme.js b/testme.js
new file mode 100644
index 0000000..e12e706
--- /dev/null
+++ b/testme.js
@@ -0,0 +1,41 @@
+// ax = (a + x)^2/2 - a^2/2 - x^2/2
+
+function half_square(x) { // round(x*x / 2), reduced to its low 16 bits
+    return Math.round(x * x / 2) & 0xffff >>> 0; // NOTE(review): `>>>` binds tighter than `&`, so this parses as `... & (0xffff >>> 0)`; the value is the same either way
+}
+
+function mul8(a, b) { // a * b via half-squares: hs(a + b) - hs(a) - hs(b), with every step wrapped to 16 bits
+    let result = half_square(a + b) & 0xffff;
+    result = (result - half_square(a)) & 0xffff;
+    result = (result - half_square(b)) & 0xffff;
+    result = (result + (b & a & 1)) & 0xffff; // add 1 when both inputs are odd: each odd half-square was rounded up by 1/2
+    return result >>> 0;
+}
+
+function mul16(a, b) { // 16x16 -> 32-bit unsigned product assembled from four mul8 partial products
+    let ah = (a & 0xff00) >>> 8; // high byte of a
+    let al = (a & 0x00ff) >>> 0; // low byte of a
+    let bh = (b & 0xff00) >>> 8; // high byte of b
+    let bl = (b & 0x00ff) >>> 0; // low byte of b
+    let result = (mul8(al, bl) & 0xffff) >>> 0; // low x low, weight 1
+    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; // cross term, weight 2^8
+    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; // cross term, weight 2^8
+    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; // high x high, weight 2^16; `>>> 0` turns the int32 back into an unsigned value
+    return result;
+}
+
+let max = 65536; // exclusive upper bound for both operands; the commented alternatives below shrink the search
+//let max = 256;
+//let max = 128;
+//let max = 8;
+
+for (let a = 0; a < max; a++) { // exhaustive check over every (a, b) pair below max
+    for (let b = 0; b < max; b++) {
+        let expected = Math.imul(a, b) >>> 0; // reference value: low 32 bits of a * b, as unsigned
+        //let actual = mul8(a, b);
+        let actual = mul16(a, b);
+        if (expected !== actual) { // only mismatches are reported; a clean run prints nothing
+            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
+        }
+    }
+}
\ No newline at end of file
diff --git a/todo.md b/todo.md
new file mode 100644
index 0000000..6807ae2
--- /dev/null
+++ b/todo.md
@@ -0,0 +1,17 @@
+things to try:
+
+* fix status bar to show elapsed time, per-iter time, per-pixel iter count
+
+* 'turbo' mode disabling graphics in full or part
+
+* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
+
+* maybe clean up the load/layout of the big mul table
+
+* consider alternate lookup tables in the top 16KB under ROM
+
+* y-axis mirror optimization
+
+* extract viewport for display & re-input via keyboard
+
+* fujinet screenshot/viewport uploader