diff --git a/Makefile b/Makefile
index 711adcd..bd14c7d 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
 all : mandel.xex
 
 mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
 
 %.o : %.s
 	ca65 -o $@ $<
@@ -15,6 +15,4 @@ clean :
 	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
-	rm -f mandel.map
-
 
diff --git a/mandel.s b/mandel.s
index b52f24a..b8985b3 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,16 +1,16 @@
 ; Our zero-page vars
-ox              = $80 ; fixed6.26: center point x
-oy              = $84 ; fixed6.26: center point y
-cx              = $88 ; fixed6.26: c_x
-cy              = $8c ; fixed6.26: c_y
+ox              = $80 ; fixed8.24: center point x
+oy              = $84 ; fixed8.24: center point y
+cx              = $88 ; fixed8.24: c_x
+cy              = $8c ; fixed8.24: c_y
 
-zx              = $90 ; fixed6.26: z_x
-zy              = $94 ; fixed6.26: z_y
-zx_2            = $98 ; fixed6.26: z_x^2
-zy_2            = $9c ; fixed6.26: z_y^2
+zx              = $90 ; fixed8.24: z_x
+zy              = $94 ; fixed8.24: z_y
+zx_2            = $98 ; fixed8.24: z_x^2
+zy_2            = $9c ; fixed8.24: z_y^2
 
-zx_zy           = $a0 ; fixed6.26: z_x * z_y
-dist            = $a4 ; fixed6.26: z_x^2 + z_y^2
+zx_zy           = $a0 ; fixed8.24: z_x * z_y
+dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
 sx              = $a8 ; i16: screen pixel x
 sy              = $aa ; i16: screen pixel y
 z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
@@ -31,12 +31,10 @@ chroma_offset   = $bb ; u8
 palette_ticks   = $bc ; u8
 chroma_ticks    = $bd ; u8
 count_frames    = $be ; u8
-; free space $bf
+count_pixels    = $bf ; u8
 
-count_iters     = $c0 ; u16
-text_col        = $c2 ; u8
-text_row        = $c3 ; u8
-; free space c4-cb
+total_pixels    = $c0 ; float48
+total_ms        = $c6 ; float48
 temp            = $cc ; u16
 temp2           = $ce ; u16
 
@@ -61,12 +59,10 @@ LBUFF  = $0580 ; result buffer for FASC routine
 ; FP ROM routine vectors
 FASC   = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
 IFP    = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
-FPI    = $D9D2 ; floating point to integer
 FADD   = $DA66 ; ADDITION       (FR0 += FR1)
 FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
 FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
 FDIV   = $DB28 ; DIVISION       (FR0 /= FR1)
-ZFR0   = $DA44 ; clear FR0
 ZF1    = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
 FLD0R  = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
 FLD1R  = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
@@ -80,7 +76,7 @@ framebuffer_bottom = $b000
 display_list       = $bf00
 framebuffer_end    = $c000
 
-height = 176
+height = 184
 half_height = height >> 1
 width = 160
 half_width = width >> 1
@@ -126,10 +122,6 @@ KEY_7     = 51
 KEY_8     = 53
 KEY_9     = 48
 KEY_0     = 50
-KEY_PERIOD = 34
-KEY_E     = 42
-KEY_X     = 22
-KEY_Y     = 43
 
 .struct float48
     exponent .byte
@@ -148,68 +140,25 @@ strings:
 str_self:
     .byte "MANDEL-6502"
 str_self_end:
-    .byte 0
 str_speed:
-    .byte "us/iter: "
+    .byte " ms/px"
 str_speed_end:
-    .byte 0
 str_run:
     .byte " RUN"
 str_run_end:
-    .byte 0
 str_done:
     .byte "DONE"
 str_done_end:
-    .byte 0
-str_padding:
-    .byte "      "
-str_padding_end:
-    .byte 0
-
-str_space:
-    .byte " "
-    .byte 0
-
-str_h:
-    .byte "h"
-    .byte 0
-str_m:
-    .byte "m"
-    .byte 0
-str_s:
-    .byte "s"
-    .byte 0
 
+str_self_len = str_self_end - str_self
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
-str_padding_len = str_padding_end - str_padding
+speed_precision = 6
 
-; "3h59m59s"
-str_elapsed_spacer = 8
-speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1
+speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
+speed_len = 14 + str_speed_len
 
-col_x = 1
-str_x:
-    .byte "X:"
-    .byte 0
-str_x_len = 2
-str_x_space = 12
-str_x_padding = 2
-
-col_y = col_x + str_x_len + str_x_space + str_x_padding
-str_y:
-    .byte "Y:"
-    .byte 0
-str_y_len = 2
-str_y_space = 12
-str_y_padding = 2
-
-col_zoom = col_y + str_y_len + str_y_space + str_y_padding
-str_zoom:
-    .byte "ZOOM:"
-    .byte 0
-str_zoom_len = 5
 
 char_map:
     ; Map ATASCII string values to framebuffer font entries
@@ -225,12 +174,8 @@ char_map:
     .endrepeat
 
 hex_chars:
-digits_zero:
     .byte "0123456789abcdef"
 
-digits_space:
-    .byte " 123456789abcdef"
-
 aspect:
     ; aspect ratio!
     ; pixels at 320w are 5:6 (narrow)
@@ -244,49 +189,20 @@ aspect:
     ;
     ; 184h is the equiv of 220.8h at square pixels
     ; 320 / 220.8 = 1.45 display aspect ratio
-aspect_x: ; fixed3.13 5/4
-    .word 5 << (13 - 2)
+aspect_x: ; fixed4.12 5/4
+    .word 5 << (12 - 2)
 
-aspect_y: ; fixed3.13 3/4
-    .word 3 << (13 - 2)
+aspect_y: ; fixed4.12 3/4
+    .word 3 << (12 - 2)
 
-fixed3_13_as_float: ; float48
-    ; 1 << 13
-    ; 8192
-    ; 81 92 . 00 00 00
-    .byte 65 ; exponent/sign - +1 byte
-    .byte $81
-    .byte $92
-    .byte $00
-    .byte $00
-    .byte $00
-
-sec_per_frame: ; float48 00 . 01 66 66 66 67
-    .byte 63  ; exponent/sign - -1 bytes
-    .byte $01 ; BCD digits
+ms_per_frame: ; float48 16.66666667
+    .byte 64  ; exponent/sign
+    .byte $16 ; BCD digits
     .byte $66
     .byte $66
     .byte $66
     .byte $67
 
-us_per_sec: ; float48 1e9 01 00 0,0 00 . 00
-    .byte 67  ; exponent/sign +3 bytes
-    .byte $01 ; BCD digits
-    .byte $00
-    .byte $00
-    .byte $00
-    .byte $00
-
-total_iters: ; float48
-    .repeat 6
-        .byte 0
-    .endrepeat
-
-total_sec: ; float48
-    .repeat 6
-        .byte 0
-    .endrepeat
-
 display_list_start:
     ; 24 lines overscan
     .repeat 3
@@ -310,10 +226,6 @@ display_list_start:
         .byte $0e
     .endrep
 
-    ; 8 scan lines, 1 row of 40-column text
-    .byte $42
-    .addr textbuffer + 40
-
     .byte $41 ; jump and blank
     .addr display_list
 display_list_end:
@@ -322,9 +234,9 @@ display_list_len = display_list_end - display_list_start
 color_map:
     .byte 0
     .repeat 85
-        .byte %01010101
-        .byte %10101010
-        .byte %11111111
+        .byte 1
+        .byte 2
+        .byte 3
     .endrepeat
 
 
@@ -373,46 +285,23 @@ fill_masks:
     .byte %00000001
     .byte %00000000
 
-pixel_masks:
-    .byte %11111111
-    .byte %11110000
-    .byte %11000000
-
 viewport_zoom:
-    .byte 0
-    .byte 5
-    .byte 7
-    .byte 5
-    .byte 7
-    .byte 7
+    .byte 1
+    .byte 6
+    .byte 8
+    .byte 6
 
 viewport_ox:
-    .dword ($00000000 & $3fffffff) << 2
-    .dword ($ff110000 & $3fffffff) << 2
-    .dword ($ff110000 & $3fffffff) << 2
-    .dword ($fe400000 & $3fffffff) << 2
-    .dword ($fe3b0000 & $3fffffff) << 2
-    .dword $fd220000
+    .dword $00000000
+    .dword $ff110000
+    .dword $ff110000
+    .dword $fe400000
 
 viewport_oy:
-    .dword ($00000000 & $3fffffff) << 2
-    .dword ($ffb60000 & $3fffffff) << 2
-    .dword ($ffbe0000 & $3fffffff) << 2
-    .dword ($00000000 & $3fffffff) << 2
-    .dword ($fffe0000 & $3fffffff) << 2
-    .dword $ff000000
-
-elapsed_work:
-    .dword 0
-elapsed_digit:
-    .byte 0
-
-input_col:
-    .byte 0
-input_row:
-    .byte 0
-input_max:
-    .byte 0
+    .dword $00000000
+    .dword $ffb60000
+    .dword $ffbe0000
+    .dword $00000000
 
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@@ -461,7 +350,7 @@ input_max:
     sub 4, dest, arg1, arg2
 .endmacro
 
-; 3 + 5 * (bytes - 1) cycles
+; 3 + 5 * bytes cycles
 .macro shl bytes, arg
     asl arg              ; 3 cyc
     .repeat bytes-1, i
@@ -469,23 +358,22 @@ input_max:
     .endrepeat
 .endmacro
 
-; 8 cycles
+; 13 cycles
 .macro shl16 arg
     shl 2, arg
 .endmacro
 
-; 13 cycles
+; 18 cycles
 .macro shl24 arg
     shl 3, arg
 .endmacro
 
-; 18 cycles
+; 23 cycles
 .macro shl32 arg
     shl 4, arg
 .endmacro
 
 ; 6 * bytes cycles
-; 4 * bytes bytes
 .macro copy bytes, dest, arg
     .repeat bytes, byte ; 6 * bytes cycles
         lda arg + byte  ; 3 cyc
@@ -494,7 +382,6 @@ input_max:
 .endmacro
 
 ; 12 cycles
-; 8 bytes
 .macro copy16 dest, arg
     copy 2, dest, arg
 .endmacro
@@ -529,19 +416,17 @@ input_max:
     neg 4, arg
 .endmacro
 
-; 11-27 + 18 * shift cycles
-; 65-81 cycles for shift=3
+; 11-27 + 23 * shift cycles
+; 103-119 cycles for shift=4
 .macro shift_round_16 arg, shift
     .repeat shift
-        shl32 arg ; 18 cycles
+        shl32 arg ; 23 cycles
     .endrepeat
     round16 arg ; 11-27 cycles
 .endmacro
 
 ; input: arg1, arg2 as fixed4.12
 ; output: dest as fixed8.24
-; patch point jsr at 16 bytes in
-imul16_patch_offset = 16
 .macro imul16 dest, arg1, arg2
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
@@ -551,8 +436,6 @@ imul16_patch_offset = 16
 
 ; input: arg as fixed4.12
 ; output: dest as fixed8.24
-; patch point jsr at 8 bytes in
-sqr16_patch_offset = 8
 .macro sqr16 dest, arg
     copy16 FR0, arg   ; 12 cyc
     jsr sqr16_func    ; ? cyc
@@ -570,6 +453,20 @@ sqr16_patch_offset = 8
     sta dest + 1
 .endmacro
 
+; input: arg as u8 (used as index into sqr_lobyte / sqr_hibyte)
+; input/output: dest as u16 (dest += table entry; final carry is not stored)
+; clobbers a, x, flags
+.macro sqr8_add16 dest, arg
+    ldx arg
+    clc
+    lda sqr_lobyte,x
+    adc dest
+    sta dest
+    lda sqr_hibyte,x
+    adc dest + 1
+    sta dest + 1
+.endmacro
+
 .segment "TABLES"
 ; lookup table for top byte -> PORTB value for bank-switch
 .align 256
@@ -698,6 +595,71 @@ bank_switch_table:
     .endif
 .endmacro
 
+.proc imul8xe_init
+
+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0
+    lda EXTENDED_RAM
+    beq init
+
+    ; no bank switching available, we just overwrite the value in base ram
+    rts
+
+init:
+
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+
+    ; ditto for sqr16_func -> sqr16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+
+    ; create the lookup table
+    ; go through the input set, in four 16KB chunks
+
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+
+    lda #$00
+    sta arg1
+    sta arg2
+    sta ptr
+    lda #$40
+    sta ptr + 1
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc
 
 ; Initialize a 16 KB chunk of the table
 ; input: multipliers in temp
@@ -787,8 +749,9 @@ inner_loop:
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
     imul8 result, arg1, arg2, xe
-
-    imul8 result + 2, arg1 + 1, arg2 + 1, xe
+    lda #0
+    sta result + 2
+    sta result + 3
 
     imul8 inter, arg1 + 1, arg2, xe
     add16 result + 1, result + 1, inter
@@ -798,6 +761,9 @@ inner_loop:
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
+    imul8 inter, arg1 + 1, arg2 + 1, xe
+    add16 result + 2, result + 2, inter
+
     ; In case of negative inputs, adjust high word
     ; https://stackoverflow.com/a/28827013
     lda arg1 + 1
@@ -830,8 +796,9 @@ arg2_pos:
         ; h*h*256*256 + h*l*256 + h*l*256 + l*l
 
         sqr8 result, arg
-
-        sqr8 result + 2, arg + 1
+        lda #0
+        sta result + 2
+        sta result + 3
 
         imul8 inter, arg + 1, arg, xe
         add16 result + 1, result + 1, inter
@@ -839,6 +806,8 @@ arg2_pos:
         add16 result + 1, result + 1, inter
         add_carry result + 3
 
+        sqr8_add16 result + 2, arg + 1
+
         rts ; 6 cyc
     .endscope
 .endmacro
@@ -904,83 +873,10 @@ next:
 
 .endmacro
 
-; input in FR0, 16 bits signed 3.13 fixed
-; output in FR0, Atari float
-; clobbers a, x, y, FR0, FR1
-.proc fixed3_13_to_float
-    ldx #.lobyte(fixed3_13_as_float)
-    ldy #.hibyte(fixed3_13_as_float)
-    jsr FLD1R
-
-    ; check sign bit! conversion routine is for unsigned
-    lda FR0 + 1
-    bpl positive
-
-negative:
-    neg16 FR0
-    jsr IFP
-
-    ; set float sign bit
-    lda FR0
-    ora #$80
-    sta FR0
-    jmp common
-
-positive:
-    jsr IFP
-
-common:
-    jsr FDIV
-    rts
-
-.endproc
-
-; rounds to 16-bit first!
-; input in FR0, 32 bits signed 6.26 fixed
-; output in FR0, Atari float
-; clobbers a, x, y, FR0, FR1
-.proc fixed6_26_to_float
-    shift_round_16 FR0, 3
-    copy16 FR0, FR0 + 2
-    jsr fixed3_13_to_float
-    rts
-.endproc
-
-; input in FR0, Atari float
-; output in FR0, 16 bits signed 3.13 fixed
-; clobbers a, x, y, FR0, FR1
-.proc float_to_fixed3_13
-    ldx #.lobyte(fixed3_13_as_float)
-    ldy #.hibyte(fixed3_13_as_float)
-    jsr FLD1R
-    jsr FMUL
-
-    ; check sign bit! conversion routine is for unsigned
-    lda FR0
-    bcc positive
-
-negative:
-    ; clearfloat sign bit
-    lda FR0
-    eor #$80
-    sta FR0
-
-    jsr FPI
-    neg16 FR0
-    jmp common
-
-positive:
-    jsr FPI
-
-common:
-    rts
-
-.endproc
-
 .proc mandelbrot
     ; input:
-    ; cx: position scaled to 6.26 fixed point - -32..+31.9
-    ; cy: position scaled to 6.26
+    ; cx: position scaled to 8.24 fixed point - -128..+127.9
+    ; cy: position scaled to 8.24
     ;
     ; output:
     ; iter: iteration count at escape or 0
@@ -1031,11 +927,6 @@ common:
     sta z_buffer_end
 
 loop:
-    inc count_iters
-    bne low_iters
-    inc count_iters + 1
-low_iters:
-
     ; iter++ & max-iters break
     inc iter
     bne keep_going
@@ -1043,7 +934,7 @@ low_iters:
 keep_going:
 
     .macro quick_exit arg, max
-        ; arg: fixed6.26
+        ; arg: fixed8.24
         ; max: integer
         .local positive
         .local negative
@@ -1056,12 +947,12 @@ keep_going:
         bmi negative
 
     positive:
-        cmp #(max << 2)
+        cmp #max
         bmi all_done ; 'less than'
         jmp exit_path
 
     negative:
-        cmp #(256 - (max << 2))
+        cmp #(256 - max)
         beq first_equal ; 'equal' on first byte
         bpl all_done    ; 'greater than'
 
@@ -1081,7 +972,7 @@ keep_going:
     all_done:
     .endmacro
 
-    ; 6.26: (-32 .. 31.9)
+    ; 8.24: (-128 .. 127.9)
     ; zx = zx_2  - zy_2  + cx
     sub32 zx, zx_2, zy_2
     add32 zx, zx, cx
@@ -1092,20 +983,17 @@ keep_going:
     add32 zy, zy, cy
     quick_exit zy, 2
 
-    ; convert 6.26 -> 3.13: (-4 .. +3.9)
-    shift_round_16 zx, 3
-    shift_round_16 zy, 3
+    ; convert 8.24 -> 4.12: (-8 .. +7.9)
+    shift_round_16 zx, 4
+    shift_round_16 zy, 4
 
     ; zx_2 = zx * zx
-fixup_sqr16_1:
     sqr16 zx_2, zx + 2
 
     ; zy_2 = zy * zy
-fixup_sqr16_2:
     sqr16 zy_2, zy + 2
 
     ; zx_zy = zx * zy
-fixup_imul16_1:
     imul16 zx_zy, zx + 2, zy + 2
 
     ; dist = zx_2 + zy_2
@@ -1219,9 +1107,9 @@ enough:
 .endmacro
 
 .macro zoom_factor dest, src, aspect
-    ; output: dest: fixed6.26
-    ; input: src: fixed3.13
-    ; aspect: fixed3.13
+    ; output: dest: fixed8.24
+    ; input: src: fixed4.12
+    ; aspect: fixed4.12
     ; clobbers A, X, flags, etc
     copy16 dest, src
     scale_zoom dest
@@ -1239,11 +1127,8 @@ enough:
     ; iter -> color
     ldx iter
     lda color_map,x
-    ldx fill_level
-    and pixel_masks,x
     sta pixel_color
-    lda pixel_masks,x
-    eor #$ff
+    lda #(255 - 3)
     sta pixel_mask
 
     ; sy -> line base address in temp
@@ -1292,23 +1177,22 @@ point:
     ; pixel_mask <<= pixel_shift (shifting in ones)
     and #3
     sta pixel_shift
+    lda #3
+    sec
+    sbc pixel_shift
     tax
 shift_loop:
     beq shift_done
-    lsr pixel_color
-    lsr pixel_color
+    asl pixel_color
+    asl pixel_color
     sec
-    ror pixel_mask
+    rol pixel_mask
     sec
-    ror pixel_mask
+    rol pixel_mask
     dex
     jmp shift_loop
 shift_done:
 
-    ldy fill_level
-    ldx fill_masks,y
-    inx
-
     ; pixel_offset = temp >> 2
     lda temp
     lsr a
@@ -1316,94 +1200,48 @@ shift_done:
     sta pixel_offset
     tay
 
-draw_pixel:
     ; read, mask, or, write
     lda (pixel_ptr),y
     and pixel_mask
     ora pixel_color
     sta (pixel_ptr),y
 
-    dex
-    beq done
-    clc
-    lda #40
-    adc pixel_ptr
-    sta pixel_ptr
-    lda #0
-    adc pixel_ptr + 1
-    sta pixel_ptr + 1
-    jmp draw_pixel
-
-done:
     rts
 .endproc
 
-; in/out: column in text_col
-; in: row in text_row
-; in: pointer to string in INBUFF
-; clobbers x/y/a/temp
-.proc draw_string
-    drawptr = temp
-    strptr = INBUFF
-
-    clc
-    lda #.lobyte(textbuffer)
-    adc text_col
-    sta temp
-    lda #.hibyte(textbuffer)
-    adc #0
-    sta temp + 1
-
-    ldx text_row
-    beq done_rows
-continue_rows:
-    clc
-    lda temp
-    adc #40
-    sta temp
-    lda temp + 1
-    adc #0
-    sta temp + 1
-    dex
-    bne continue_rows
-
-done_rows:
-
-    ldy #0
+.macro draw_text_indirect col, len, strptr
+    ; clobbers A, X, Y; copies exactly len bytes (no terminator/high-bit handling)
+    .local loop
+    .local done
+    ldx #0
 loop:
-    lda (strptr),y
-    ; if char's null, terminate c-style
+    cpx #len
     beq done
-    ; save the char for terminator check
-    pha
-    ; strip the high bit (terminator)
-    and #$7f
-    tax
-    lda char_map,x
-    sta (drawptr),y
-    iny
-
-    pla
-    ; _last_ char has high bit set in atari rom routines
-    bmi done
+    txa
+    tay
+    lda (strptr),y
+    tay
+    lda char_map,y
+    sta textbuffer + col,x
+    inx
     jmp loop
-
 done:
-    ; move the text column pointer
-    tya
-    clc
-    adc text_col
-    sta text_col
+.endmacro
 
-    rts
-.endproc
-
-.macro draw_string_const str
-    lda #.lobyte(str)
-    sta INBUFF
-    lda #.hibyte(str)
-    sta INBUFF + 1
-    jsr draw_string
+.macro draw_text col, len, cstr
+    ; clobbers A, X, Y
+    .local loop
+    .local done
+    ldx #0
+loop:
+    cpx #len
+    beq done
+    ldy cstr,x
+    lda char_map,y
+    sta textbuffer + col,x
+    inx
+    jmp loop
+done:
 .endmacro
 
 .proc vblank_handler
@@ -1529,7 +1367,7 @@ skip_char:
 
 plus:
     lda zoom
-    cmp #7
+    cmp #8
     bpl skip_char
     inc zoom
     jmp done
@@ -1540,20 +1378,16 @@ minus:
     dec zoom
     jmp done
 up:
-    add32 oy, oy, temp
-    jsr display_coords
+    sub32 oy, oy, temp
     jmp done
 down:
-    sub32 oy, oy, temp
-    jsr display_coords
+    add32 oy, oy, temp
     jmp done
 left:
     sub32 ox, ox, temp
-    jsr display_coords
     jmp done
 right:
     add32 ox, ox, temp
-    jsr display_coords
     jmp done
 
 number_keys:
@@ -1565,11 +1399,7 @@ number_keys:
     beq three
     cpy #KEY_4
     beq four
-    cpy #KEY_5
-    beq five
-    cpy #KEY_6
-    beq six
-    jmp letter_keys
+    jmp skip_char
 
 one:
     ldx #0
@@ -1582,27 +1412,7 @@ three:
     jmp load_key_viewport
 four:
     ldx #3
-    jmp load_key_viewport
-five:
-    ldx #4
-    jmp load_key_viewport
-six:
-    ldx #5
-    jmp load_key_viewport
-
-letter_keys:
-    cpy #KEY_X
-    bne not_x
-    jsr input_x
-    jmp done
-not_x:
-    cpy #KEY_Y
-    bne not_y
-    jsr input_y
-    jmp done
-not_y:
-    jmp skip_char
-
+    ; fall through
 load_key_viewport:
     jsr load_viewport
     ; fall through
@@ -1612,23 +1422,6 @@ done:
 
 .endproc
 
-.proc input_x
-    ldx #col_x
-    ldy #1
-    jsr input_number
-
-
-    rts
-.endproc
-
-.proc input_y
-    rts
-.endproc
-
-.proc input_number
-    rts
-.endproc
-
 .proc clear_screen
     ; zero the range from framebuffer_top to display_list
     lda #.lobyte(framebuffer_top)
@@ -1654,59 +1447,12 @@ zero_byte_loop:
 
 .proc status_bar
     ; Status bar
-
-    lda #0
-    sta text_col
-    lda #0
-    sta text_row
-    draw_string_const str_self
-
-    lda #(40 - str_run_len)
-    sta text_col
-    draw_string_const str_run
+    draw_text 0, str_self_len, str_self
+    draw_text 40 - str_run_len, str_run_len, str_run
 
     rts
 .endproc
 
-.proc display_coords
-    lda #1
-    sta text_row
-    lda #col_x
-    sta text_col
-    draw_string_const str_x
-
-    copy32 FR0, ox
-    jsr fixed6_26_to_float
-    jsr FASC
-    jsr draw_string
-
-    lda #col_y
-    sta text_col
-    draw_string_const str_y
-
-    copy32 FR0, oy
-    jsr fixed6_26_to_float
-    jsr FASC
-    jsr draw_string
-
-    lda #col_zoom
-    sta text_col
-    draw_string_const str_zoom
-
-    lda zoom
-    clc
-    adc #0
-    sta FR0
-    lda #0
-    sta FR0 + 1
-    jsr IFP
-    jsr FASC
-    jsr draw_string
-
-    rts
-
-.endproc
-
 ; input: viewport selector in x
 ; clobbers: a, x
 .proc load_viewport
@@ -1758,7 +1504,6 @@ zero_byte_loop:
     sta DMACTL
 
     jsr clear_screen
-    jsr display_coords
 
     ; Copy the display list into properly aligned memory
     ; Can't cross 1024-byte boundaries :D
@@ -1797,24 +1542,19 @@ copy_byte_loop:
     jsr SETVBV
 
 main_loop:
-    ; count_frames = 0; count_iters = 0
+    ; count_frames = 0; count_pixels = 0
     lda #0
     sta count_frames
-    sta count_iters
-    sta count_iters + 1
+    sta count_pixels
 
-    ; total_sec = 0.0; total_iters = 0.0
-    jsr ZFR0
-    ldx #.lobyte(total_sec)
-    ldy #.hibyte(total_sec)
-    jsr FST0R
-    ldx #.lobyte(total_iters)
-    ldy #.hibyte(total_iters)
-    jsr FST0R
+    ; total_ms = 0.0; total_pixels = 0.0
+    ldx #total_ms
+    jsr ZF1
+    ldx #total_pixels
+    jsr ZF1
 
     jsr clear_screen
     jsr status_bar
-    jsr display_coords
 
     lda #0
     sta fill_level
@@ -1872,7 +1612,6 @@ not_skipped_mask:
     zoom_factor cx, sx, aspect_x
     add32 cx, cx, ox
     zoom_factor cy, sy, aspect_y
-    neg32 cy
     add32 cy, cy, oy
     jsr mandelbrot
     jsr pset
@@ -1884,32 +1623,38 @@ not_skipped_mask:
 
 no_key:
     ; check if we should update the counters
+    ;
+    ; count_pixels >= width? update!
+    inc count_pixels
+    lda count_pixels
+    cmp #width
+    bcs update_status ; unsigned >= (bmi fired at 32: 32 - 160 wraps negative)
 
     ; count_frames >= 120? update!
     lda count_frames
     cmp #120 ; >= 2 seconds
-    bpl update_status
-    jmp skipped
+    bcc skipped ; unsigned <, same size as bmi so branch range is unchanged
 
 update_status:
-    ; FR0 = (float)count_iters & clear count_iters
-    copy16 FR0, count_iters
-    jsr IFP
+    ; FR0 = (float)count_pixels & clear count_pixels
+    lda count_pixels
+    sta FR0
     lda #0
-    sta count_iters
-    sta count_iters + 1
+    sta FR0 + 1
+    sta count_pixels
+    jsr IFP
 
-    ; FR1 = total_iters
-    ldx #.lobyte(total_iters)
-    ldy #.hibyte(total_iters)
+    ; FR1 = total_pixels
+    ldx #.lobyte(total_pixels)
+    ldy #.hibyte(total_pixels)
     jsr FLD1R
 
     ; FR0 += FR1
     jsr FADD
 
-    ; total_iters = FR0
-    ldx #.lobyte(total_iters)
-    ldy #.hibyte(total_iters)
+    ; total_pixels = FR0
+    ldx #.lobyte(total_pixels)
+    ldy #.hibyte(total_pixels)
     jsr FST0R
 
 
@@ -1922,100 +1667,44 @@ update_status:
     sta count_frames
     jsr IFP
 
-    ; FR0 *= sec_per_frame
-    ldx #.lobyte(sec_per_frame)
-    ldy #.hibyte(sec_per_frame)
+    ; FR0 *= ms_per_frame
+    ldx #.lobyte(ms_per_frame)
+    ldy #.hibyte(ms_per_frame)
     jsr FLD1R
     jsr FMUL
 
-    ; FR0 += total_sec
-    ldx #.lobyte(total_sec)
-    ldy #.hibyte(total_sec)
+    ; FR0 += total_ms
+    ldx #total_ms
+    ldy #0
     jsr FLD1R
     jsr FADD
 
-    ; total_sec = FR0
-    ldx #.lobyte(total_sec)
-    ldy #.hibyte(total_sec)
+    ; total_ms = FR0
+    ldx #total_ms
+    ldy #0
     jsr FST0R
 
-    ; FR0 /= total_iters
-    ldx #.lobyte(total_iters)
-    ldy #.hibyte(total_iters)
+    ; FR0 /= total_pixels
+    ldx #total_pixels
+    ldy #0
     jsr FLD1R
     jsr FDIV
 
-    ; FR0 *= us_per_sec
-    ldx #.lobyte(us_per_sec)
-    ldy #.hibyte(us_per_sec)
-    jsr FLD1R
-    jsr FMUL
-
-    ; round (down) to integer
-    jsr FPI
-    clc
-    jsr IFP
-
-    lda #speed_start
-    sta text_col
-    lda #0
-    sta text_row
-    draw_string_const str_speed
-
-    lda text_col
-    pha
-    draw_string_const str_padding
-    pla
-    sta text_col
-
-    ; convert to ASCII in INBUFF and print
+    ; convert to ASCII in INBUFF
     jsr FASC
-    jsr draw_string
 
-    ; elapsed time
-    ; FR0 = total_sec
-    ldx #.lobyte(total_sec)
-    ldy #.hibyte(total_sec)
-    jsr FLD0R
-    ; FR0 -> integer -> elapsed_work
-    jsr FPI
-    lda FR0
-    sta elapsed_work
-    lda FR0 + 1
-    sta elapsed_work + 1
-
-    draw_string_const str_space
-    
-    .macro do_countdown divisor, digits
-        ldx #.lobyte(divisor)
-        ldy #.hibyte(divisor)
-        lda #.lobyte(digits)
-        sta INBUFF
-        lda #.hibyte(digits)
-        sta INBUFF + 1
-        jsr countdown
-    .endmacro
-    do_countdown 36000, digits_space
-    do_countdown 3600, digits_zero
-    draw_string_const str_h
-    do_countdown 600, digits_zero
-    do_countdown 60, digits_zero
-    draw_string_const str_m
-    do_countdown 10, digits_zero
-    do_countdown 1, digits_zero
-    draw_string_const str_s
+    ; print the first speed_precision chars of the FASC string, then the unit label
+    draw_text_indirect speed_start, speed_precision, INBUFF
+    draw_text speed_start + speed_precision, str_speed_len, str_speed
 
 skipped:
 
-    ; sx += fill_level[fill_masks] + 1
-    ldx fill_level
-    lda fill_masks,x
     clc
-    adc #1 ; will never carry
-    adc sx
+    lda sx
+    adc #1
     sta sx
-    lda #0
-    adc sx + 1
+    lda sx + 1
+    adc #0
     sta sx + 1
 
     lda sx
@@ -2025,15 +1714,12 @@ skipped:
 
 loop_sx_done:
 
-    ; sy += fill_level[fill_masks] + 1
-    ldx fill_level
-    lda fill_masks,x
     clc
-    adc #1 ; will never carry
-    adc sy
+    lda sy
+    adc #1
     sta sy
-    lda #0
-    adc sy + 1
+    lda sy + 1
+    adc #0
     sta sy + 1
 
     lda sy
@@ -2052,130 +1738,9 @@ fill_loop_done:
 
 loop:
     ; finished
-
-    lda #(40 - str_done_len)
-    sta text_col
-    lda #0
-    sta text_row
-    draw_string_const str_done
-
+    draw_text 40 - str_done_len, str_done_len, str_done
     jsr keycheck
     beq loop
     jmp main_loop
 
 .endproc
-
-; digit string in INBUFF
-; divisor X/Y
-; clobbers temp, calls draw_string
-.proc countdown
-    divisor = temp
-    stx divisor
-    sty divisor + 1
-
-    ; count the hours
-    ldy #0
-countdown_loop:
-    lda elapsed_work + 1
-    cmp divisor + 1
-    beq countdown_lobyte
-    bcc countdown_done
-    bcs countdown_inc
-countdown_lobyte:
-    lda elapsed_work
-    cmp divisor
-    bcc countdown_done
-countdown_inc:
-    sec
-    lda elapsed_work
-    sbc divisor
-    sta elapsed_work
-    lda elapsed_work + 1
-    sbc divisor + 1
-    sta elapsed_work + 1
-    iny
-    jmp countdown_loop
-countdown_done:
-    lda (INBUFF),y
-    eor #$80
-    sta elapsed_digit
-    lda #.lobyte(elapsed_digit)
-    sta INBUFF
-    lda #.hibyte(elapsed_digit)
-    sta INBUFF + 1
-    jsr draw_string
-    rts
-.endproc
-
-.proc imul8xe_init
-
-    bank_switch 0
-    lda #0
-    sta EXTENDED_RAM
-    bank_switch 1
-    lda #1
-    sta EXTENDED_RAM
-    bank_switch 0
-    lda EXTENDED_RAM
-    beq init
-
-    ; no bank switching available, we just overwrite the value in base ram
-    rts
-
-init:
-
-    ; patch imul16_func into a forwarding thunk to imul16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta imul16_func
-    lda #.lobyte(imul16xe_func)
-    sta imul16_func + 1
-    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1
-    lda #.hibyte(imul16xe_func)
-    sta imul16_func + 2
-    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2
-
-    ; ditto for sqr16_func -> sqr16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta sqr16_func
-    lda #.lobyte(sqr16xe_func)
-    sta sqr16_func + 1
-    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1
-    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1
-    lda #.hibyte(sqr16xe_func)
-    sta sqr16_func + 2
-    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2
-    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2
-
-
-    ; create the lookup table
-    ; go through the input set, in four 16KB chunks
-
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-
-    lda #$00
-    sta arg1
-    sta arg2
-    sta ptr
-    lda #$40
-    sta ptr + 1
-
-    ; $00 * $00 -> $3f * $ff
-    bank_switch 0
-    jsr imul8xe_init_section
-
-    ; $40 * $00 -> $7f * $ff
-    bank_switch 1
-    jsr imul8xe_init_section
-
-    ; $80 * $00 -> $bf * $ff
-    bank_switch 2
-    jsr imul8xe_init_section
-
-    ; $c0 * $00 -> $ff * $ff
-    bank_switch 3
-    jsr imul8xe_init_section
-
-    rts
-.endproc
diff --git a/readme.md b/readme.md
index 2c9efc1..d60644c 100644
--- a/readme.md
+++ b/readme.md
@@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g
 
 ## Current state
 
-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
 
 The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
 
@@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
 
-The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
 
 Iterations are capped at 255.
 
diff --git a/todo.md b/todo.md
index 6807ae2..284d653 100644
--- a/todo.md
+++ b/todo.md
@@ -1,17 +1,19 @@
 things to try:
 
-* fix status bar to show elapsed time, per-iter time, per-pixel iter count
-
-* 'turbo' mode disabling graphics in full or part
+* skip add on the top-byte multiply in sqr8/mul8
+  * should save a few cycles, suggestion by jamey
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
-* maybe clean up the load/layout of the big mul table
-
-* consider alternate lookup tables in the top 16KB under ROM
+* try 3.13 fixed point instead of 4.12 for more precision
+  * can we get away without the extra bit?
+  * since the exit comparison would happen in 6.26 space, I think so
 
 * y-axis mirror optimization
 
+* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
+  * maybe redo tiering to just 4x4, 2x2, 1x1?
+
 * extract viewport for display & re-input via keyboard
 
 * fujinet screenshot/viewport uploader