13 changed files with 924 additions and 2436 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,3 @@
 *.o
 *.xex
-tables.s
 .DS_Store
--- a/.mailmap
+++ b/.mailmap
@ -1,2 +0,0 @@
-Brooke Vibber <bvibber@pobox.com>
-Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>
--- a/14
+++ b/14
@ -2,21 +2,13 @@

 all : mandel.xex

-mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg
-	ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib
-
-mandel.s : mandel.c mandel.h
-	cc65 -o $@ mandel.c
+# 'all' and 'clean' name actions, not files. Declare them phony so a stray
+# file called 'all' or 'clean' can never make them look up to date.
+.PHONY : all clean
+
+# Link one assembled object ($<) into an Atari .xex executable ($@).
+%.xex : %.o
+	ld65 -C atari-asm-xex.cfg -o $@ $<

 %.o : %.s
 	ca65 -o $@ $<

-tables.s : tables.js
-	node tables.js > tables.s
-
 clean :
-	rm -f tables.s
-	rm -f mandel.s
 	rm -f *.o
 	rm -f *.xex
-	rm -f mandel.map
+
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@ -1,28 +0,0 @@
-FEATURES {
-    STARTADDRESS: default = $2E00;
-}
-SYMBOLS {
-    __STARTADDRESS__: type = export, value = %S;
-}
-MEMORY {
-    ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
-    # Keep $4000-7fff clear for expanded RAM access window
-    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
-    # Keep $a000-$bfff clear for BASIC cartridge
-}
-FILES {
-    %O: format = atari;
-}
-FORMATS {
-    atari: runad = start;
-}
-SEGMENTS {
-    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
-    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
-    CODE:     load = MAIN,    type = rw,                  define = yes;
-    RODATA:   load = MAIN,    type = ro   optional = yes;
-    DATA:     load = MAIN,    type = rw   optional = yes;
-    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
-}
--- a/atari-xex.cfg
+++ b/atari-xex.cfg
@ -1,69 +0,0 @@
-# Sample linker configuration for C programs using the Atari binary file support.
-# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex
-FEATURES {
-    STARTADDRESS: default = $8000;
-}
-SYMBOLS {
-    __SYSTEM_CHECK__:    type = import;  # force inclusion of "system check" load chunk
-    __STACKSIZE__:       type = weak, value = $0800; # 2k stack
-    __STARTADDRESS__:    type = export, value = %S;
-    __RESERVED_MEMORY__: type = weak, value = $0000;
-    __SYSCHKHDR__:       type = export, value = 0; # Disable system check header
-    __SYSCHKTRL__:       type = export, value = 0; # Disable system check trailer
-    __TABLESEG_START__:    type = weak, value = $2E00 + $0300;
-    __TABLESEG_SIZE__:     type = weak, value = 6 * $100;
-    __BANKSY_START__:  type = weak, value = $4000;
-    __BANKSY_SIZE__:   type = weak, value = $4000;
-    __FRAMEBUFFER_START__: type = weak, value = $A000;
-}
-MEMORY {
-# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP.
-    ZP:         file = "", define = yes, start = $0082, size = $007E;
-# "system check" load chunk
-    SYSCHKCHNK: file = %O,               start = $2E00, size = $0300;
-# Note $a000-$bfff is against the BASIC cartridge, may require booting with OPTION.
-    TABLES:     file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__;
-# We reserve $4000-7fff for the bank-switch window.
-# In theory we could keep data and code here that we only use on 48k/64k systems.
-    BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__;
-# "main program" load chunk
-    MAIN:       file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S;
-}
-FILES {
-    %O: format = atari;
-}
-FORMATS {
-    atari: runad = start,
-           initad = SYSCHKCHNK: __SYSTEM_CHECK__;
-}
-SEGMENTS {
-    ZEROPAGE:  load = ZP,         type = zp;
-    EXTZP:     load = ZP,         type = zp,                optional = yes;
-    SYSCHK:    load = SYSCHKCHNK, type = rw,  define = yes, optional = yes;
-    TABLES:    load = TABLES,     type = ro,  optional = yes, align = 256;
-    BANKSWICH: load = BANKSWITCH, type = ro,  optional = yes;
-    STARTUP:   load = MAIN,       type = ro,  define = yes;
-    LOWBSS:    load = MAIN,       type = rw,                optional = yes;  # not zero initialized
-    LOWCODE:   load = MAIN,       type = ro,  define = yes, optional = yes;
-    ONCE:      load = MAIN,       type = ro,                optional = yes;
-    CODE:      load = MAIN,       type = ro,  define = yes;
-    RODATA:    load = MAIN,       type = ro;
-    DATA:      load = MAIN,       type = rw;
-    INIT:      load = MAIN,       type = rw,                optional = yes;
-    BSS:       load = MAIN,       type = bss, define = yes;
-}
-FEATURES {
-    CONDES: type    = constructor,
-            label   = __CONSTRUCTOR_TABLE__,
-            count   = __CONSTRUCTOR_COUNT__,
-            segment = ONCE;
-    CONDES: type    = destructor,
-            label   = __DESTRUCTOR_TABLE__,
-            count   = __DESTRUCTOR_COUNT__,
-            segment = RODATA;
-    CONDES: type    = interruptor,
-            label   = __INTERRUPTOR_TABLE__,
-            count   = __INTERRUPTOR_COUNT__,
-            segment = RODATA,
-            import  = __CALLIRQ__;
-}
--- a/mandel-core.s
+++ b/mandel-core.s
--- a/mandel.c
+++ b/mandel.c
@ -1,15 +0,0 @@
-/**
- * The UI and I/O wrapper for the Mandelbrot runner, in C.
- *
- * For the moment *all* logic is in mandel-core.s, I'm just
- * trying to get this to run within a cc65 environment.
- * Eventually just the inner loop fun will live in there.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "mandel.h"
-
-void main(void) {
-    mandel_start();
-}
--- a/mandel.h
+++ b/mandel.h
@ -1,4 +0,0 @@
-#include <inttypes.h>
-
-// From mandel-core.s:
-extern void mandel_start(void);
--- a/mandel.s
+++ b/mandel.s
@ -0,0 +1,912 @@
+; Our zero-page vars
+; Each symbol is the address of the variable's first byte; multi-byte
+; integers are stored low byte first (see the add/sub macros).
+sx    = $80     ; i16: screen pixel x
+sy    = $82     ; i16: screen pixel y
+ox    = $84     ; fixed4.12: center point x
+oy    = $86     ; fixed4.12: center point y
+cx    = $88     ; fixed4.12: c_x
+cy    = $8a     ; fixed4.12: c_y
+zx    = $8c     ; fixed4.12: z_x
+zy    = $8e     ; fixed4.12: z_y
+
+; NOTE: mandelbrot zeroes every byte from zx up to and including iter with
+; one indexed loop, so zx must stay the lowest and iter the highest address
+; of that group.
+zx_2  = $90     ; fixed4.12: z_x^2
+zy_2  = $92     ; fixed4.12: z_y^2
+zx_zy = $94     ; fixed4.12: z_x * z_y
+dist  = $96     ; fixed4.12: z_x^2 + z_y^2
+
+iter         = $a0 ; u8: iteration count
+
+zoom         = $a1 ; u8: zoom shift level
+count_frames = $a2 ; u8
+count_pixels = $a3 ; u8
+total_ms     = $a4 ; float48
+total_pixels = $aa ; float48
+
+temp         = $b0 ; u16
+pixel_ptr    = $b2 ; u16
+pixel_color  = $b4 ; u8
+pixel_mask   = $b5 ; u8
+pixel_shift  = $b6 ; u8
+pixel_offset = $b7 ; u8
+
+
+; FP registers in zero page
+; imul16_func also borrows FR0, FR1 and FR2 as integer workspace.
+FR0    = $d4 ; float48
+FRE    = $da ; NOTE(review): undocumented here -- presumably another FP work register; confirm
+FR1    = $e0 ; float48
+FR2    = $e6 ; float48
+CIX    = $f2 ; u8 - index into INBUFF
+INBUFF = $f3 ; u16 - pointer to ascii
+FLPTR  = $fc ; u16 - pointer to user buffer float48
+
+LBUFF  = $0580 ; result buffer for FASC routine
+
+; FP ROM routine vectors
+; (YYXX) = address passed with the low byte in X and the high byte in Y
+; (XX)   = zero-page address passed in X
+FASC   = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
+IFP    = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
+FADD   = $DA66 ; ADDITION       (FR0 += FR1)
+FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
+FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
+FDIV   = $DB28 ; DIVISION       (FR0 /= FR1)
+ZF1    = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
+FLD0R  = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
+FLD1R  = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
+FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
+FMOVE  = $DDB6 ; MOVE FR0 TO FR1
+
+; High data
+; The picture is stored as two halves of half_height lines each, with
+; their own start addresses; the display list loads each half separately.
+framebuffer_top    = $8000
+textbuffer         = $8f00 ; one row of 40-column status text
+framebuffer_bottom = $9000
+display_list       = $9f00 ; run-time copy of display_list_start
+framebuffer_end    = $a000 ; first address past the area cleared at startup
+
+height = 184
+half_height = height >> 1
+width = 160
+half_width = width >> 1
+stride = width >> 2 ; bytes per line: 4 pixels per byte at 2bpp
+
+; Display hardware registers
+DMACTL = $D400
+DLISTL = $D402
+DLISTH = $D403
+
+; OS shadow registers
+SDLSTL = $230
+SDLSTH = $231
+
+; interrupt stuff
+XITVBV = $E462 ; exit point jumped to at the end of vblank_handler
+SETVBV = $E45C ; installs a vblank vector (A = which, X = high, Y = low)
+
+; Layout of a float48 value: 6 bytes (48 bits) in total, one exponent/sign
+; byte followed by five bytes of BCD digits. This matches ms_per_frame,
+; copyfloat and the 6-byte spacing of total_ms / total_pixels.
+.struct float48
+    exponent .byte
+    mantissa .byte 5
+.endstruct
+
+.data
+
+; Status-bar strings. Each has a start and an end label so its length can
+; be computed at assembly time; none is terminated.
+strings:
+str_self:
+    .byte "MANDEL-6502"
+str_self_end:
+str_speed:
+    .byte "ms/px"
+str_speed_end:
+str_run:
+    .byte " RUN"
+str_run_end:
+str_done:
+    .byte "DONE"
+str_done_end:
+
+str_self_len = str_self_end - str_self
+str_speed_len = str_speed_end - str_speed
+str_run_len = str_run_end - str_run
+str_done_len = str_done_end - str_done
+
+; Text column where the speed number is written: two columns past the title.
+speed_start = str_self_len + 2
+; NOTE(review): speed_len (and str_speed) are not referenced by the drawing
+; code in this file yet.
+speed_len = 14 + str_speed_len
+
+char_map:
+    ; 128-entry table translating ATASCII string values into the
+    ; framebuffer font entries used by the text row:
+    ;   ATASCII   0..31  ->  64..95
+    ;   ATASCII  32..95  ->   0..63
+    ;   ATASCII  96..127 ->  96..127
+    .repeat 32, n
+        .byte 64 + n
+    .endrepeat
+    .repeat 64, n
+        .byte n
+    .endrepeat
+    .repeat 32, n
+        .byte n + 96
+    .endrepeat
+
+aspect:
+    ; aspect ratio!
+    ; pixels at 320w are 5:6 (narrow)
+    ; pixels at 160w are 5:3 (wide)
+    ;
+    ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
+    ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
+    ;
+    ; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
+    ; & horizontal range -80 .. 79.9 is -3.125 .. 3.124
+    ;
+    ; 184h is the equiv of 220.8h at square pixels
+    ; 320 / 220.8 = 1.45 display aspect ratio
+aspect_x: ; fixed4.12 5/4 (5 << 10 = 5 * 4096 / 4)
+    .word 5 << (12 - 2)
+
+aspect_y: ; fixed4.12 3/4 (3 << 10 = 3 * 4096 / 4)
+    .word 3 << (12 - 2)
+
+ms_per_frame: ; float48 constant 16.66666667
+    .byte $40                     ; exponent/sign byte (decimal 64)
+    .byte $16, $66, $66, $66, $67 ; BCD digits
+
+display_list_start:
+    ; Template for the display list; start copies display_list_len bytes of
+    ; it to display_list before pointing the hardware there.
+
+    ; 24 lines overscan: 3 x 8 blank lines
+    .repeat 3
+        .byte $70
+    .endrepeat
+
+    ; 1 row of 40-column text (8 scan lines), read from textbuffer
+    .byte $42
+    .addr textbuffer
+
+    ; 184 lines graphics in ANTIC mode e (160px 2bpp, 1 scan line per line),
+    ; as two halves that each start with their own load address
+    .byte $4e
+    .addr framebuffer_top
+    .repeat half_height - 1
+        .byte $0e
+    .endrepeat
+    .byte $4e
+    .addr framebuffer_bottom
+    .repeat half_height - 1
+        .byte $0e
+    .endrepeat
+
+    ; jump and blank, back to the start of the run-time copy
+    .byte $41
+    .addr display_list
+display_list_end:
+display_list_len = display_list_end - display_list_start
+
+color_map:
+    ; 256 entries indexed by iteration count: entry 0 is color 0,
+    ; entries 1..255 cycle through colors 1, 2, 3.
+    .byte 0
+    .repeat 85
+        .byte 1, 2, 3
+    .endrepeat
+
+.code
+
+.export start
+
+; Multi-byte add, low byte first: dest = arg1 + arg2
+; Clobbers A and the carry flag.
+; 2 + 9 * bytes cycles
+.macro add bytes, dest, arg1, arg2
+    clc                       ; 2 cyc - nothing carried into the low byte
+    .repeat bytes, offset     ; 9 cyc per byte
+        lda arg1 + offset
+        adc arg2 + offset
+        sta dest + offset
+    .endrepeat
+.endmacro
+
+; 16-bit add: dest = arg1 + arg2
+.macro add16 dest, arg1, arg2
+    add 2, dest, arg1, arg2
+.endmacro
+
+; 32-bit add: dest = arg1 + arg2
+; (Arguments are passed straight through, matching add16 and sub32.)
+.macro add32 dest, arg1, arg2
+    add 4, dest, arg1, arg2
+.endmacro
+
+; Multi-byte subtract, low byte first: dest = arg1 - arg2
+; Clobbers A and the carry flag.
+; 2 + 9 * bytes cycles
+.macro sub bytes, dest, arg1, arg2
+    sec                       ; 2 cyc - nothing borrowed from the low byte
+    .repeat bytes, offset     ; 9 cyc per byte
+        lda arg1 + offset
+        sbc arg2 + offset
+        sta dest + offset
+    .endrepeat
+.endmacro
+
+; 16-bit subtract: dest = arg1 - arg2
+.macro sub16 dest, arg1, arg2
+    sub 2, dest, arg1, arg2
+.endmacro
+
+; 32-bit subtract: dest = arg1 - arg2
+.macro sub32 dest, arg1, arg2
+    sub 4, dest, arg1, arg2
+.endmacro
+
+; Multi-byte shift left by one bit, in place: a zero enters the low byte
+; and each higher byte picks up the bit shifted out of the one below it.
+.macro shl bytes, arg
+    asl arg
+    .repeat bytes-1, n
+        rol arg + n + 1
+    .endrepeat
+.endmacro
+
+; 16-bit shift left by one
+.macro shl16 arg
+    shl 2, arg
+.endmacro
+
+; 24-bit shift left by one
+.macro shl24 arg
+    shl 3, arg
+.endmacro
+
+; 32-bit shift left by one
+.macro shl32 arg
+    shl 4, arg
+.endmacro
+
+; Multi-byte copy: dest = arg. Clobbers A.
+; 6 * bytes cycles
+.macro copy bytes, dest, arg
+    .repeat bytes, offset     ; 6 cyc per byte
+        lda arg + offset      ; 3 cyc
+        sta dest + offset     ; 3 cyc
+    .endrepeat
+.endmacro
+
+; copy a 16-bit value
+.macro copy16 dest, arg
+    copy 2, dest, arg
+.endmacro
+
+; copy a 32-bit value
+.macro copy32 dest, arg
+    copy 4, dest, arg
+.endmacro
+
+; copy a 6-byte float48 value
+.macro copyfloat dest, arg
+    copy 6, dest, arg
+.endmacro
+
+; Multi-byte negate in place: arg = 0 - arg. Clobbers A and the carry flag.
+; 2 + 8 * bytes cycles
+.macro neg bytes, arg
+    sec                       ; 2 cyc
+    .repeat bytes, offset     ; 8 cyc per byte
+        lda #0                ; 2 cyc
+        sbc arg + offset      ; 3 cyc
+        sta arg + offset      ; 3 cyc
+    .endrepeat
+.endmacro
+
+; negate a 16-bit value in place
+; 18 cycles
+.macro neg16 arg
+    neg 2, arg
+.endmacro
+
+; negate a 32-bit value in place
+; 34 cycles
+.macro neg32 arg
+    neg 4, arg
+.endmacro
+
+; inner loop for imul16
+; One shift-and-add step: if bit `bitnum` of arg1 is set, add arg2 into the
+; top 16 bits of result; then shift result right by one bit.
+; bitnum must be an assembly-time constant. Clobbers A and flags.
+; bitnum < 8: 25 or 41 cycles
+; bitnum >= 8: 30 or 46 cycles
+.macro bitmul16 arg1, arg2, result, bitnum
+    .local zero
+    .local one
+    .local next
+
+    ; does 16-bit adds
+    ; arg1 and arg2 are treated as unsigned
+    ; negative signed inputs must be flipped first
+
+    ; 7 cycles up to the branch
+
+    ; check if arg1 has 0 or 1 bit in this place
+    ; 5 cycles either way
+    .if bitnum < 8
+        lda arg1                 ; 3 cyc
+        and #(1 << (bitnum))       ; 2 cyc
+    .else
+        lda arg1 + 1             ; 3 cyc
+        and #(1 << ((bitnum) - 8)) ; 2 cyc
+    .endif
+    bne one ; 2 cyc
+
+zero: ; 18 cyc, 23 cyc
+    ; nothing to add: start the right shift with a zero entering the top
+    lsr result + 3 ; 5 cyc
+    jmp next       ; 3 cyc
+
+one: ; 32 cyc, 37 cyc
+    ; 16-bit add on the top bits
+    clc            ; 2 cyc
+    lda result + 2 ; 3 cyc
+    adc arg2       ; 3 cyc
+    sta result + 2 ; 3 cyc
+    lda result + 3 ; 3 cyc
+    adc arg2 + 1   ; 3 cyc
+    ; the carry out of the add becomes the new top bit
+    ror a          ; 2 cyc - get a jump on the shift
+    sta result + 3 ; 3 cyc
+next:
+    ; carry holds the bit shifted out of result + 3; pass it down the chain
+    ror result + 2 ; 5 cyc
+    ror result + 1 ; 5 cyc
+    .if bitnum >= 8
+        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
+        ; when it's all uninitialized data
+        ror result ; 5 cyc
+    .endif
+
+.endmacro
+
+; If the 16-bit value arg is negative, negate it in place and increment X,
+; so X ends up counting how many arguments had their sign bit set.
+; Clobbers A.
+; 5 to 25 cycles
+.macro check_sign arg
+    .local non_negative
+    lda arg + 1       ; 3 cyc - high byte carries the sign bit
+    bpl non_negative  ; 2 cyc
+    neg16 arg         ; 18 cyc
+    inx               ; 2 cyc
+non_negative:
+.endmacro
+
+; Signed multiply: dest (32-bit) = arg1 (16-bit) * arg2 (16-bit)
+; Goes through FR0, FR1 and FR2; clobbers A and X.
+; 518 - 828 cyc
+.macro imul16 dest, arg1, arg2
+    copy16 FR0, arg1      ; 12 cyc - first operand
+    copy16 FR1, arg2      ; 12 cyc - second operand
+    jsr imul16_func       ; 470-780 cyc
+    copy32 dest, FR2      ; 24 cyc - fetch the full product
+.endmacro
+
+; Shift the 32-bit value arg left by `shift` bits (an assembly-time
+; constant), then round its top 16 bits in place with round16.
+.macro shift_round_16 arg, shift
+    .repeat shift
+        shl32 arg
+    .endrepeat
+    round16 arg
+.endmacro
+
+; Signed multiply with rounding:
+; dest (16-bit) = top 16 bits of ((arg1 * arg2) << shift), rounded.
+; Goes through FR0, FR1 and FR2; clobbers A and X.
+.macro imul16_round dest, arg1, arg2, shift
+    copy16 FR0, arg1          ; 12 cyc - first operand
+    copy16 FR1, arg2          ; 12 cyc - second operand
+    jsr imul16_func           ; 470-780 cyc
+    shift_round_16 FR2, shift
+    copy16 dest, FR2 + 2      ; 12 cyc - keep only the high word
+.endmacro
+
+; Signed 16 x 16 -> 32-bit multiply.
+;   in:  FR0, FR1 = the two 16-bit operands (both clobbered)
+;   out: FR2      = 32-bit product
+; Clobbers A and X.
+; min 470 cycles
+; max 780 cycles
+.proc imul16_func
+    lhs = FR0     ; 16-bit operand (clobbered)
+    rhs = FR1     ; 16-bit operand (clobbered)
+    product = FR2 ; 32-bit result
+
+    ; make both operands non-negative, counting flipped signs in X
+    ldx #0          ; 2 cyc
+    check_sign lhs  ; 5 to 25 cyc
+    check_sign rhs  ; 5 to 25 cyc
+
+    ; clear the top 16 bits of the product;
+    ; the bottom two bytes get filled in by the shifts
+    lda #0           ; 2 cyc
+    sta product + 2  ; 3 cyc
+    sta product + 3  ; 3 cyc
+
+    ; fully unrolled, one step per multiplier bit: bigger but faster
+    ; 440 to 696 cycles
+    .repeat 16, bit
+        ; bit < 8: 25 or 41 cycles
+        ; bit >= 8: 30 or 46 cycles
+        bitmul16 lhs, rhs, product, bit
+    .endrepeat
+
+    ; exactly one negative input means a negative product
+    cpx #1            ; 2 cyc
+    bne sign_done     ; 2 cyc
+    neg32 product     ; 34 cyc
+sign_done:
+
+    rts ; 6 cyc
+.endproc
+
+.macro round16 arg
+    ; Round top 16 bits of 32-bit fixed-point number in-place, based on the
+    ; low word (arg + 0 .. arg + 1) that the caller is about to drop.
+    ; Clobbers A and flags.
+    .local round_up
+    .local exact_high_byte
+    .local tie_break
+    .local done
+
+    ; low word > $8000: round up
+    ;          = $8000: round up   if positive
+    ;                   round down if negative
+    ;          < $8000: round down
+
+    lda arg + 1
+    cmp #$80
+    beq exact_high_byte ; high byte is exactly $80: the low byte decides
+    bcs round_up        ; high byte > $80 (unsigned): more than one half
+    bcc done            ; high byte < $80: less than one half
+
+exact_high_byte:
+    lda arg
+    beq tie_break       ; low word is exactly $8000
+    ; $8001..$80ff is more than one half regardless of the low byte's top
+    ; bit, so this must not be a signed (bpl/bmi) test
+    bne round_up
+
+tie_break:
+    lda arg + 3
+    bmi done            ; negative value: leave the top word as it is
+
+round_up:        ; 5-10 cyc
+    inc arg + 2  ; 5 cyc
+    bne done     ; 2 cyc
+    inc arg + 3  ; 5 cyc - carry into the top byte
+
+done:
+
+.endmacro
+
+; Iterate z = z^2 + c for one point until it escapes or the 8-bit
+; iteration counter wraps around.
+; Clobbers A, X and the FR0/FR1/FR2 multiply workspace.
+.proc mandelbrot
+    ; input:
+    ; cx: position scaled to 4.12 fixed point - -8..+7.9
+    ; cy: position scaled to 4.12
+    ;
+    ; output:
+    ; iter: iteration count at escape or 0
+
+    ; zx = 0
+    ; zy = 0
+    ; zx_2 = 0
+    ; zy_2 = 0
+    ; zx_zy = 0
+    ; dist = 0
+    ; iter = 0
+    ; (one loop clears every byte from zx up to and including iter)
+    lda #00
+    ldx #(iter - zx + 1)
+initloop:
+    sta zx - 1,x
+    dex
+    bne initloop
+
+loop:
+    ; iter++ & max-iters break
+    ; (iter wrapping to 0 is what reports "never escaped")
+    inc iter
+    bne keep_going
+    rts
+keep_going:
+
+    ; Return from mandelbrot (leaving iter as it stands) when the 4.12
+    ; value arg is >= max or <= -max; otherwise fall through.
+    ; Only the high byte is compared unless it sits exactly on -max.
+    .macro quick_exit arg, max
+        .local positive
+        .local negative
+        .local nope_out
+        .local first_equal
+        .local all_done
+
+        ; check sign bit
+        lda arg + 1
+        bmi negative
+
+    positive:
+        cmp #((max) << 4)
+        bmi all_done ; 'less than'
+        rts
+
+    negative:
+        cmp #(256 - ((max) << 4))
+        beq first_equal ; 'equal' on first byte
+        bpl all_done    ; 'greater than'
+
+    nope_out:
+        rts
+    
+    first_equal:
+        lda arg
+        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
+
+    all_done:
+    .endmacro
+
+    ; 4.12: (-8 .. +7.9)
+    ; zx = zx_2  - zy_2  + cx
+    sub16 zx, zx_2, zy_2
+    add16 zx, zx, cx
+    quick_exit zx, 2
+
+    ; zy = zx_zy + zx_zy + cy
+    add16 zy, zx_zy, zx_zy
+    add16 zy, zy, cy
+    quick_exit zy, 2
+
+    ; zx_2 = zx * zx
+    imul16_round zx_2, zx, zx, 4
+
+    ; zy_2 = zy * zy
+    imul16_round zy_2, zy, zy, 4
+
+    ; zx_zy = zx * zy
+    imul16_round zx_zy, zx, zy, 4
+
+    ; dist = zx_2 + zy_2
+    add16 dist, zx_2, zy_2
+    quick_exit dist, 4
+
+    ; TODO: if may be in the lake, look for looping output with a small buffer
+    ; as an optimization vs running to max iters (not implemented yet)
+    jmp loop
+
+    ; NOTE(review): nothing in this proc branches to peace_out
+peace_out:
+    rts
+
+.endproc
+
+; Convert a screen coordinate to a fixed4.12 coordinate:
+;   dest = (src << (8 - zoom)) * aspect
+; src is a 16-bit screen coordinate, zoom the address of the u8 zoom level,
+; aspect the address of a fixed4.12 scale factor.
+; Clobbers A, X and the FR0/FR1/FR2 multiply workspace.
+.macro zoom_factor dest, src, zoom, aspect
+    .local shift_more
+    .local shifted
+
+    ; dest = src, then shift left once for every step from zoom up to 8
+    copy16 dest, src
+    ldx zoom
+shift_more:
+    cpx #8
+    beq shifted
+    shl16 dest
+    inx
+    jmp shift_more
+shifted:
+
+    ; apply the aspect correction (3/4 for y, 5/4 for x)
+    imul16_round dest, dest, aspect, 4
+.endmacro
+
+; Plot one 2bpp pixel at (sx, sy) in the color selected by iter.
+; Clobbers A, X, Y, temp and the pixel_* variables.
+.proc pset
+    ; screen coords in signed sx,sy
+    ; iter holds the target to use
+
+    ; iter -> color
+    ldx iter
+    lda color_map,x
+    sta pixel_color
+    ; mask keeps every bit except the two of the lowest pixel position
+    lda #(255 - 3)
+    sta pixel_mask
+
+    ; sy -> base address in pixel_ptr
+    lda sy
+    bpl positive
+
+negative:
+    ; pixel_ptr = just past the last line of the top half;
+    ; adding the negative sy * stride below steps back up into it
+    lda #.lobyte(framebuffer_top + stride * half_height)
+    sta pixel_ptr
+    lda #.hibyte(framebuffer_top + stride * half_height)
+    sta pixel_ptr + 1
+    jmp point
+
+positive:
+
+    ; pixel_ptr = first line of the bottom half
+    lda #.lobyte(framebuffer_bottom)
+    sta pixel_ptr
+    lda #.hibyte(framebuffer_bottom)
+    sta pixel_ptr + 1
+
+point:
+
+    ; pixel_ptr += sy * stride
+    ;    temp * 40
+    ; =  temp * 32  +  temp * 8
+    ; = (temp << 5) + (temp << 3)
+    copy16 temp, sy
+    shl16 temp
+    shl16 temp
+    shl16 temp
+    add16 pixel_ptr, pixel_ptr, temp
+    shl16 temp
+    shl16 temp
+    add16 pixel_ptr, pixel_ptr, temp
+
+    ; Ok so pixel_ptr points to the start of the line, which is 40 bytes.
+    ; Get the byte and bit offsets
+    ; temp = sx + half_width, the 0-based column
+    lda sx
+    clc
+    adc #half_width
+    sta temp
+
+    ; pixel_shift = temp & 3
+    ; then, (3 - pixel_shift) times:
+    ;   pixel_color <<= 2 (shifting in zeros)
+    ;   pixel_mask  <<= 2 (shifting in ones)
+    and #3
+    sta pixel_shift
+    lda #3
+    sec
+    sbc pixel_shift
+    tax
+shift_loop:
+    ; Z comes from the tax above on entry and from the dex on later passes
+    beq shift_done
+    asl pixel_color
+    asl pixel_color
+    sec
+    rol pixel_mask
+    sec
+    rol pixel_mask
+    dex
+    jmp shift_loop
+shift_done:
+
+    ; pixel_offset = temp >> 2
+    lda temp
+    lsr a
+    lsr a
+    sta pixel_offset
+    tay
+
+    ; read, mask, or, write
+    lda (pixel_ptr),y
+    and pixel_mask
+    ora pixel_color
+    sta (pixel_ptr),y
+
+    rts
+.endproc
+
+; Write len characters of the string at cstr into the text row, starting at
+; column col, translating each one through char_map.
+; col and len are assembly-time constants.
+.macro draw_text col, len, cstr
+    ; clobbers A, X, Y
+    .local next_char
+    .local finished
+    ldx #0
+next_char:
+    cpx #len
+    beq finished
+    ldy cstr,x              ; raw character
+    lda char_map,y          ; its font entry
+    sta textbuffer + col,x
+    inx
+    jmp next_char
+finished:
+.endmacro
+
+; Vertical-blank handler installed by start through SETVBV:
+; counts one frame, then leaves through the OS exit vector.
+.proc vblank_handler
+    inc count_frames    ; u8 frame counter, read and cleared by start
+    jmp XITVBV
+.endproc
+
+; Placeholder for the speed display update; the work is currently done
+; inline in start. The rts makes a call return immediately instead of
+; falling through into start.
+.proc update_speed
+    ; convert frames (u16) to fp
+    ; add to frames_total
+    ; convert pixels (u16) to fp
+    ; add to pixels_total
+    ; (frames_total * 16.66666667) / pixels_total
+    ; convert to ATASCII
+    ; draw text
+    rts
+.endproc
+
+; Program entry point: sets up the display and the frame counter, renders
+; every pixel once while keeping a running ms/px figure in the status bar,
+; then idles forever.
+.proc start
+
+    ; ox = 0; oy = 0
+    ; count_frames = 0; count_pixels = 0
+    lda #0
+    sta ox
+    sta ox + 1
+    sta oy
+    sta oy + 1
+    sta count_frames
+    sta count_pixels
+
+    ; total_ms = 0.0; total_pixels = 0.0
+    ldx #total_ms
+    jsr ZF1
+    ldx #total_pixels
+    jsr ZF1
+
+    ; zoom = 2x
+    lda #1
+    sta zoom
+
+    ; Disable display DMA
+    lda #0
+    sta DMACTL
+
+    ; zero the range from framebuffer_top to framebuffer_end,
+    ; one 256-byte page at a time, using temp as the page pointer
+    lda #.lobyte(framebuffer_top)
+    sta temp
+    lda #.hibyte(framebuffer_top)
+    sta temp + 1
+
+zero_page_loop:
+    lda #0
+    ldy #0
+zero_byte_loop:
+    sta (temp),y
+    iny
+    bne zero_byte_loop
+
+    inc temp + 1
+    lda temp + 1
+    cmp #.hibyte(framebuffer_end)
+    bne zero_page_loop
+
+    ; Copy the display list into properly aligned memory
+    ; Can't cross 1024-byte boundaries :D
+    ldx #0
+copy_byte_loop:
+    lda display_list_start,x
+    sta display_list,x
+    inx
+    cpx #display_list_len
+    bne copy_byte_loop
+
+    ; Set up the display list
+    lda #.lobyte(display_list)
+    sta DLISTL ; actual register
+    sta SDLSTL ; shadow register the OS will copy in
+    lda #.hibyte(display_list)
+    sta DLISTH ; actual register
+    sta SDLSTH ; shadow register the OS will copy in
+
+    ; Status bar: title on the left, " RUN" right-aligned
+    draw_text 0, str_self_len, str_self
+    draw_text 40 - str_run_len, str_run_len, str_run
+
+    ; Re-enable display DMA
+    lda #$22
+    sta DMACTL
+
+    ; install the vblank handler
+    lda #7 ; deferred
+    ldx #.hibyte(vblank_handler)
+    ldy #.lobyte(vblank_handler)
+    jsr SETVBV
+
+main_loop:
+    ; sy = -92 .. 91
+    lda #(256-half_height)
+    sta sy
+    lda #(256-1)
+    sta sy + 1
+
+loop_sy:
+    ; sx = -80 .. 79
+    lda #(256-half_width)
+    sta sx
+    lda #(256-1)
+    sta sx + 1
+
+loop_sx:
+    zoom_factor cx, sx, zoom, aspect_x
+    zoom_factor cy, sy, zoom, aspect_y
+    jsr mandelbrot
+    jsr pset
+
+
+    ; check if we should update the counters
+    ;
+    ; count_pixels >= width? update!
+    ; Both counters are unsigned bytes, so test the carry after cmp
+    ; (bcs: >=, bcc: <). The sign of the 8-bit difference is wrong here:
+    ; against 160 it reports ">=" for 32..159 and "<" for 160 and up.
+    inc count_pixels
+    lda count_pixels
+    cmp #width
+    bcs update_status
+
+    ; count_frames >= 120? update!
+    lda count_frames
+    cmp #120 ; >= 2 seconds
+    bcc skip_status
+
+update_status:
+    ; FR0 = (float)count_pixels & clear count_pixels
+    lda count_pixels
+    sta FR0
+    lda #0
+    sta FR0 + 1
+    sta count_pixels
+    jsr IFP
+
+    ; FR1 = total_pixels
+    ldx #.lobyte(total_pixels)
+    ldy #.hibyte(total_pixels)
+    jsr FLD1R
+
+    ; FR0 += FR1
+    jsr FADD
+
+    ; total_pixels = FR0
+    ldx #.lobyte(total_pixels)
+    ldy #.hibyte(total_pixels)
+    jsr FST0R
+
+
+    ; FR0 = (float)count_frames & clear count_frames
+    ; warning: this should really disable interrupts @TODO
+    ; (a frame counted between the read and the clear is lost)
+    lda count_frames
+    sta FR0
+    lda #0
+    sta FR0 + 1
+    sta count_frames
+    jsr IFP
+
+    ; FR0 *= ms_per_frame
+    ldx #.lobyte(ms_per_frame)
+    ldy #.hibyte(ms_per_frame)
+    jsr FLD1R
+    jsr FMUL
+
+    ; FR0 += total_ms
+    ldx #total_ms
+    ldy #0
+    jsr FLD1R
+    jsr FADD
+
+    ; total_ms = FR0
+    ldx #total_ms
+    ldy #0
+    jsr FST0R
+
+    ; FR0 /= total_pixels
+    ldx #total_pixels
+    ldy #0
+    jsr FLD1R
+    jsr FDIV
+
+    ; convert to ASCII in INBUFF
+    jsr FASC
+
+    ; copy characters to the status bar until the one with the high bit set
+    ldy #0
+number_loop:
+    lda (INBUFF),y
+    bmi lastchar
+
+    tax
+    lda char_map,x
+    sta textbuffer + speed_start,y
+
+    iny
+    bpl number_loop
+lastchar:
+    ; Y is last char
+    ; trim that high bit
+    and #$7f
+    tax
+    lda char_map,x
+    sta textbuffer + speed_start,y
+
+    ; Fill out any remaining spaces, up to and including offset 20
+    lda #0
+space_loop:
+    iny
+    sta textbuffer + speed_start,y
+    cpy #(20)
+    bcc space_loop ; unsigned Y < 20
+
+skip_status:
+
+    ; sx++ (16-bit)
+    clc
+    lda sx
+    adc #1
+    sta sx
+    lda sx + 1
+    adc #0
+    sta sx + 1
+
+    ; the line is finished once the low byte of sx reaches half_width
+    lda sx
+    cmp #half_width
+    beq loop_sx_done
+    jmp loop_sx
+
+loop_sx_done:
+
+    ; sy++ (16-bit)
+    clc
+    lda sy
+    adc #1
+    sta sy
+    lda sy + 1
+    adc #0
+    sta sy + 1
+
+    ; the picture is finished once the low byte of sy reaches half_height
+    lda sy
+    cmp #half_height
+    beq loop_sy_done
+    jmp loop_sy
+
+loop_sy_done:
+
+    draw_text 40 - str_done_len, str_done_len, str_done
+
+loop:
+    ; finished
+    jmp loop
+.endproc
--- a/readme.md
+++ b/readme.md
@ -14,37 +14,30 @@ Non-goals:

 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.

-- brooke, january 2023 - december 2024
+-- brion, january 2023

 ## Current state

-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
+Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.

-The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.

-* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
-* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
-* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
-* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
+The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.

-The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.

 Iterations are capped at 255.

-The pixels are run in a progressive layout to get the basic shape on screen faster.
+## Next steps

-There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
+Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!

-There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
+Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.

-There's some cute color cycling.
+I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.

 ## Deps and build instructions

 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.

 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
-
-## Todo
-
-See ideas in `todo.md`.
--- a/tables.js
+++ b/tables.js
@ -1,50 +0,0 @@
-function db(func) {
-    let lines = [];
-    for (let i = 0; i < 256; i += 16) {
-        let items = [];
-        for (let j = 0; j < 16; j++) {
-            let x = i + j;
-            items.push(func(x));
-        }
-        lines.push('    .byte ' + items.join(', '));
-    }
-    return lines.join('\n');
-}
-
-let squares = [];
-for (let i = 0; i < 512; i++) {
-    squares.push(Math.trunc((i * i + 1) / 2));
-}
-
-console.log(
-`.segment "TABLES"
-
-.export mul_lobyte256
-.export mul_hibyte256
-.export mul_hibyte512
-.export sqr_lobyte
-.export sqr_hibyte
-
-; (i * i + 1) / 2 for the multiplier
-.align 256
-mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
-
-.align 256
-mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
-
-.align 256
-mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
-
-; (i * i) for the plain squares
-.align 256
-sqr_lobyte:
-${db((i) => (i * i) & 0xff)}
-
-.align 256
-sqr_hibyte:
-${db((i) => ((i * i) >> 8) & 0xff)}
-
-`);
--- a/testme.js
+++ b/testme.js
@ -1,41 +0,0 @@
-// ax = (a + x)2/2 - a2/2 - x2/2 
-
-function half_square(x) {
-    return Math.round(x * x / 2) & 0xffff >>> 0;
-}
-
-function mul8(a, b) {
-    let result = half_square(a + b) & 0xffff;
-    result = (result - half_square(a)) & 0xffff;
-    result = (result - half_square(b)) & 0xffff;
-    result = (result + (b & a & 1)) & 0xffff;
-    return result >>> 0;
-}
-
-function mul16(a, b) {
-    let ah = (a & 0xff00) >>> 8;
-    let al = (a & 0x00ff) >>> 0;
-    let bh = (b & 0xff00) >>> 8;
-    let bl = (b & 0x00ff) >>> 0;
-    let result = (mul8(al, bl) & 0xffff) >>> 0;
-    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
-    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
-    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
-    return result;
-}
-
-let max = 65536;
-//let max = 256;
-//let max = 128;
-//let max = 8;
-
-for (let a = 0; a < max; a++) {
-    for (let b = 0; b < max; b++) {
-        let expected = Math.imul(a, b) >>> 0;
-        //let actual = mul8(a, b);
-        let actual = mul16(a, b);
-        if (expected !== actual) {
-            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
-        }
-    }
-}
--- a/todo.md
+++ b/todo.md
@ -1,17 +0,0 @@
-things to try:
-
-* fix status bar to show elapsed time, per-iter time, per-pixel iter count
-
-* 'turbo' mode disabling graphics in full or part
-
-* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
-
-* maybe clean up the load/layout of the big mul table
-
-* consider alternate lookup tables in the top 16KB under ROM
-
-* y-axis mirror optimization
-
-* extract viewport for display & re-input via keyboard
-
-* fujinet screenshot/viewport uploader