From f10bb4fe18d560b8db777d591484ef93da545327 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sat, 11 Feb 2023 12:24:48 -0800
Subject: [PATCH 001/104] WIP alternate imul16

not working at present
---
 .gitignore |   1 +
 Makefile   |   8 +++-
 mandel.s   | 124 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 tables.js  |  33 ++++++++++++++
 testme.js  |  41 ++++++++++++++++++
 5 files changed, 194 insertions(+), 13 deletions(-)
 create mode 100644 tables.js
 create mode 100644 testme.js

diff --git a/.gitignore b/.gitignore
index 8d2f7ce..771e47a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
diff --git a/Makefile b/Makefile
index 25148b4..008bf8c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,17 @@
 
 all : mandel.xex
 
-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 
 %.o : %.s
 	ca65 -o $@ $<
 
+tables.s : tables.js
+	node tables.js > tables.s
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
 
diff --git a/mandel.s b/mandel.s
index 097b700..71bc6c2 100644
--- a/mandel.s
+++ b/mandel.s
@@ -22,11 +22,12 @@ total_ms     = $a4 ; float48
 total_pixels = $aa ; float48
 
 temp         = $b0 ; u16
-pixel_ptr    = $b2 ; u16
-pixel_color  = $b4 ; u8
-pixel_mask   = $b5 ; u8
-pixel_shift  = $b6 ; u8
-pixel_offset = $b7 ; u8
+temp2        = $b2 ; u16
+pixel_ptr    = $b4 ; u16
+pixel_color  = $b6 ; u8
+pixel_mask   = $b7 ; u8
+pixel_shift  = $b8 ; u8
+pixel_offset = $b9 ; u8
 
 
 ; FP registers in zero page
@@ -83,6 +84,10 @@ SETVBV = $E45C
     mantissa .byte 6
 .endstruct
 
+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data
 
 strings:
@@ -206,6 +211,12 @@ color_map:
     add 4, dest, arg2, dest
 .endmacro
 
+.macro add_carry dest
+    lda dest          ; dest += carry flag (single byte); clobbers A
+    adc #0            ; no clc here: this consumes the carry left by the preceding add
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
     sec ; 2 cyc
@@ -336,12 +347,12 @@ next:
 ; 5 to 25 cycles
 .macro check_sign arg
     ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register.
     .local positive
     lda arg + 1   ; 3 cyc
     bpl positive  ; 2 cyc
     neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
+    iny           ; 2 cyc
 positive:
 .endmacro
 
@@ -370,13 +381,13 @@ positive:
 
 ; min 470 cycles
 ; max 780 cycles
-.proc imul16_func
+.proc imul16_func_orig
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
 
-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
     check_sign arg1 ; 5 to 25 cyc
     check_sign arg2 ; 5 to 25 cyc
     
@@ -396,7 +407,98 @@ positive:
     .endrepeat
 
     ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
+    cpy #1              ; 2 cyc
+    bne positive_result ; 2 cyc
+    neg32 result        ; 34 cyc
+positive_result:
+
+    rts ; 6 cyc
+.endproc
+
+; Unsigned 8x8 -> 16 multiply: a*x = (a+x)^2/2 - a^2/2 - x^2/2 via half-square
+; tables. dest = 2-byte result (lo, hi); arg1/arg2 = 1-byte memory operands.
+; Clobbers A, X and flags; Y is not used.
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2
+    .local under256
+    .local next
+    .local small_product
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
+
+        lda mul_factor_a      ; setup
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x
+        tax                   ; X = low byte of a + x
+        bcc under256
+        lda mul_hibyte512,x   ; sum >= 256: high byte comes from the 256..511 table
+        bcs next              ; carry is still set here, so this is always taken
+    under256:
+        lda mul_hibyte256,x
+        sec                   ; both paths reach `next` with carry set for the sbc
+    next:
+        sta mul_product_hi
+        lda mul_lobyte256,x   ; the same low-byte table is used on both paths
+
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1
+        clc
+        adc mul_product_lo
+        bcc small_product
+        inc mul_product_hi
+    small_product:
+        sec                   ; - x^2/2
+        sbc mul_lobyte256,x
+        sta mul_product_lo    ; fix: the final low byte was computed but never stored
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+    .endscope
+.endmacro
+
+.proc imul16_func
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2
+
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
+    check_sign arg1 ; 5 to 25 cyc
+    check_sign arg2 ; 5 to 25 cyc
+
+    lda #0
+    sta result + 0
+    sta result + 1
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg1, arg2
+    add16 result, result, inter
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+
+    imul8 inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter
+
+    ; In case of mixed input signs, return a negative result.
+    cpy #1              ; 2 cyc
     bne positive_result ; 2 cyc
     neg32 result        ; 34 cyc
 positive_result:
diff --git a/tables.js b/tables.js
new file mode 100644
index 0000000..5afc3c0
--- /dev/null
+++ b/tables.js
@@ -0,0 +1,33 @@
+function db(func) {
+    let lines = [];
+    for (let i = 0; i < 256; i += 16) {
+        let items = [];
+        for (let j = 0; j < 16; j++) {
+            let x = i + j;
+            items.push(func(x));
+        }
+        lines.push('    .byte ' + items.join(', '));
+    }
+    return lines.join('\n');
+}
+
+console.log( // writes the generated table source to stdout; the Makefile redirects it to tables.s
+`.segment "TABLES"
+
+.export mul_lobyte256
+.export mul_hibyte256
+.export mul_hibyte512
+
+.align 256
+mul_lobyte256:
+${db((x) => Math.round(x * x / 2) & 0xff)}
+
+.align 256
+mul_hibyte256:
+${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
+
+.align 256
+mul_hibyte512:
+${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
+
+`);
diff --git a/testme.js b/testme.js
new file mode 100644
index 0000000..e12e706
--- /dev/null
+++ b/testme.js
@@ -0,0 +1,41 @@
+// ax = (a + x)2/2 - a2/2 - x2/2 
+
+function half_square(x) {
+    return Math.round(x * x / 2) & 0xffff >>> 0;
+}
+
+function mul8(a, b) {
+    let result = half_square(a + b) & 0xffff;
+    result = (result - half_square(a)) & 0xffff;
+    result = (result - half_square(b)) & 0xffff;
+    result = (result + (b & a & 1)) & 0xffff;
+    return result >>> 0;
+}
+
+function mul16(a, b) {
+    let ah = (a & 0xff00) >>> 8;
+    let al = (a & 0x00ff) >>> 0;
+    let bh = (b & 0xff00) >>> 8;
+    let bl = (b & 0x00ff) >>> 0;
+    let result = (mul8(al, bl) & 0xffff) >>> 0;
+    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
+    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
+    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
+    return result;
+}
+
+let max = 65536; // exhaustive: checks all 65536 * 65536 operand pairs, so expect a long run
+//let max = 256;
+//let max = 128;
+//let max = 8;
+
+for (let a = 0; a < max; a++) {
+    for (let b = 0; b < max; b++) {
+        let expected = Math.imul(a, b) >>> 0; // reference: low 32 bits of a * b, as unsigned
+        //let actual = mul8(a, b);
+        let actual = mul16(a, b);
+        if (expected !== actual) {
+            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
+        }
+    }
+}
\ No newline at end of file

From 0501a364c728cd1af1fe29acce9f816151270fde Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 12 Feb 2023 11:37:52 -0800
Subject: [PATCH 002/104] Check for repeated zx/zy values

A repeated (zx, zy) pair means the orbit has entered a cycle and will
never escape, so bailing out early saves time for points in the lake.

The trick is taken from Fractint.
---
 mandel.s | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 106 insertions(+), 10 deletions(-)

diff --git a/mandel.s b/mandel.s
index 097b700..467245d 100644
--- a/mandel.s
+++ b/mandel.s
@@ -21,12 +21,16 @@ count_pixels = $a3 ; u8
 total_ms     = $a4 ; float48
 total_pixels = $aa ; float48
 
-temp         = $b0 ; u16
-pixel_ptr    = $b2 ; u16
-pixel_color  = $b4 ; u8
-pixel_mask   = $b5 ; u8
-pixel_shift  = $b6 ; u8
-pixel_offset = $b7 ; u8
+z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
+z_buffer_start  = $b1 ; u8: index into z_buffer
+z_buffer_end    = $b2 ; u8: index into z_buffer
+temp            = $b4 ; u16
+
+pixel_ptr       = $b6 ; u16
+pixel_color     = $b8 ; u8
+pixel_mask      = $b9 ; u8
+pixel_shift     = $ba ; u8
+pixel_offset    = $bb ; u8
 
 
 ; FP registers in zero page
@@ -186,6 +190,15 @@ color_map:
 
 .code
 
+z_buffer_len = 16                 ; number of (zx, zy) entries; each entry is 4 bytes
+z_buffer_mask = z_buffer_len - 1
+z_buffer:
+    ; the last N zx/zy values
+    .repeat z_buffer_len
+        .word 0                   ; zx
+        .word 0                   ; zy
+    .endrepeat
+
 .export start
 
 ; 2 + 9 * byte cycles
@@ -462,12 +475,14 @@ initloop:
     sta zx - 1,x
     dex
     bne initloop
+    sta z_buffer_start
+    sta z_buffer_len
 
 loop:
     ; iter++ & max-iters break
     inc iter
     bne keep_going
-    rts
+    jmp exit_path
 keep_going:
 
     .macro quick_exit arg, max
@@ -484,7 +499,7 @@ keep_going:
     positive:
         cmp #((max) << 4)
         bmi all_done ; 'less than'
-        rts
+        jmp exit_path
 
     negative:
         cmp #(256 - ((max) << 4))
@@ -492,7 +507,7 @@ keep_going:
         bpl all_done    ; 'greater than'
 
     nope_out:
-        rts
+        jmp exit_path
     
     first_equal:
         lda arg
@@ -527,9 +542,90 @@ keep_going:
 
     ; if may be in the lake, look for looping output with a small buffer
     ; as an optimization vs running to max iters
+    lda z_buffer_active
+    beq skip_z_buffer
+
+    ldx z_buffer_start
+    cpx z_buffer_end
+    beq z_nothing_to_read
+
+z_buffer_loop:
+    .macro z_compare arg
+        .local compare_no_match
+        lda z_buffer,x           ; A = stored byte at buffer index X
+        inx                      ; always step to the next byte, match or not
+        cmp arg
+        bne compare_no_match
+        iny                      ; Y counts matching bytes (4 = whole entry matches)
+    compare_no_match:
+    .endmacro
+    .macro z_advance
+        .local skip_reset_x
+        cpx #(z_buffer_len * 4)  ; wrap X back to 0 once it reaches the end of the buffer
+        bmi skip_reset_x         ; tests the sign of X - 64, so only right while X < 192
+        ldx #0
+    skip_reset_x:
+    .endmacro
+    .macro z_store arg
+        lda arg                  ; z_buffer[X] = arg, then X++ (clobbers A)
+        sta z_buffer,x
+        inx
+    .endmacro
+
+    ; Compare the previously stored z values
+    ldy #0
+    z_compare zx
+    z_compare zx + 1
+    z_compare zy
+    z_compare zy + 1
+
+    cpy #4
+    bne z_no_matches
+    jmp z_exit
+
+z_no_matches:
+    z_advance
+
+    cpx z_buffer_end
+    bne z_buffer_loop
+
+z_nothing_to_read:
+
+    ; Store and expand
+    z_store zx
+    z_store zx + 1
+    z_store zy
+    z_store zy + 1
+    z_advance
+    stx z_buffer_end
+
+    ; Increment the start roller if necessary (limit size)
+    lda iter
+    cmp #(z_buffer_len * 4)
+    bmi skip_inc_start
+    lda z_buffer_start
+    clc
+    adc #4
+    tax
+    z_advance
+    stx z_buffer_start
+skip_inc_start:
+
+skip_z_buffer:
+
     jmp loop
 
-peace_out:
+z_exit:
+    lda #0
+    sta iter
+
+exit_path:
+    ldx #0
+    lda iter
+    bne next
+    inx
+next:
+    stx z_buffer_active
     rts
 
 .endproc

From 9926ec28e73a0fb546f37c025a7364ca62cbc93f Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 5 Mar 2023 13:48:39 -0800
Subject: [PATCH 003/104] clean up speed display now uses ms/px msg

---
 mandel.s | 51 +++++++++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/mandel.s b/mandel.s
index 467245d..1f94b6a 100644
--- a/mandel.s
+++ b/mandel.s
@@ -94,7 +94,7 @@ str_self:
     .byte "MANDEL-6502"
 str_self_end:
 str_speed:
-    .byte "ms/px"
+    .byte " ms/px"
 str_speed_end:
 str_run:
     .byte " RUN"
@@ -740,6 +740,25 @@ shift_done:
     rts
 .endproc
 
+.macro draw_text_indirect col, len, strptr
+    ; textbuffer[col + i] = char_map[(strptr)[i]] for i < len; clobbers A, X, Y
+    .local loop
+    .local done
+    ldx #0
+loop:
+    cpx #len                ; len is used as an immediate, so it must be a constant
+    beq done
+    txa
+    tay                     ; Y = X, because the (ptr),y load below indexes with Y
+    lda (strptr),y          ; A = character i of the string strptr points at
+    tay
+    lda char_map,y          ; char_map lookup for that character
+    sta textbuffer + col,x
+    inx
+    jmp loop
+done:
+.endmacro
+
 .macro draw_text col, len, cstr
     ; clobbers A, X
     .local loop
@@ -940,33 +959,9 @@ update_status:
     ; convert to ASCII in INBUFF
     jsr FASC
 
-    ; find the last byte
-    ldy #0
-number_loop:
-    lda (INBUFF),y
-    bmi lastchar
-
-    tax
-    lda char_map,x
-    sta textbuffer + speed_start,y
-
-    iny
-    bpl number_loop
-lastchar:
-    ; Y is last char
-    ; trim that high bit
-    and #$7f
-    tax
-    lda char_map,x
-    sta textbuffer + speed_start,y
-
-    ; Fill out any remaining spaces
-    lda #0
-space_loop:
-    iny
-    sta textbuffer + speed_start,y
-    cpy #(20)
-    bmi space_loop
+    ; print the first 6 digits
+    draw_text_indirect speed_start, 6, INBUFF
+    draw_text speed_start + 6, str_speed_len, str_speed
 
 skip_status:
 

From 24abc21b01c5436e1bf30f83e961e756a9968c5b Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 5 Mar 2023 13:56:50 -0800
Subject: [PATCH 004/104] move speed to the right

---
 mandel.s | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mandel.s b/mandel.s
index 1f94b6a..b524c6f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -107,8 +107,9 @@ str_self_len = str_self_end - str_self
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
+speed_precision = 6
 
-speed_start = str_self_len + 2
+speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
 speed_len = 14 + str_speed_len
 
 
@@ -960,8 +961,8 @@ update_status:
     jsr FASC
 
     ; print the first 6 digits
-    draw_text_indirect speed_start, 6, INBUFF
-    draw_text speed_start + 6, str_speed_len, str_speed
+    draw_text_indirect speed_start, speed_precision, INBUFF
+    draw_text speed_start + speed_precision, str_speed_len, str_speed
 
 skip_status:
 

From 53336f7af1f5a3ef114a1dbc5f317c06fa61f1ab Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 5 Mar 2023 15:45:44 -0800
Subject: [PATCH 005/104] WIP quick hack to check keyboard

For some reason this only works ONCE, even though the same logic
replicated in BASIC works across multiple keypresses.
Not sure what's wrong yet.
---
 mandel.s | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/mandel.s b/mandel.s
index b524c6f..44bff1f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -42,6 +42,9 @@ CIX    = $f2 ; u8 - index into INBUFF
 INBUFF = $f3 ; u16 - pointer to ascii
 FLPTR  = $fc ; u16 - pointer to user buffer float48
 
+CH1    = $02f2 ; previous character read from keyboard
+CH     = $02fc ; current character read from keyboard
+
 LBUFF  = $0580 ; result buffer for FASC routine
 
 ; FP ROM routine vectors
@@ -73,15 +76,21 @@ stride = width >> 2
 DMACTL = $D400
 DLISTL = $D402
 DLISTH = $D403
+WSYNC  = $D40A
 
 ; OS shadow registers
 SDLSTL = $230
 SDLSTH = $231
 
 ; interrupt stuff
+SYSVBV = $E45F
 XITVBV = $E462
 SETVBV = $E45C
 
+
+; Keys
+KEY_RIGHT = $07
+
 .struct float48
     exponent .byte
     mantissa .byte 6
@@ -126,6 +135,9 @@ char_map:
         .byte 96 + i
     .endrepeat
 
+hex_chars:
+    .byte "0123456789abcdef"
+
 aspect:
     ; aspect ratio!
     ; pixels at 320w are 5:6 (narrow)
@@ -888,6 +900,33 @@ loop_sx:
     jsr mandelbrot
     jsr pset
 
+    ; check keyboard buffer
+    lda CH
+    cmp #$ff
+    beq skip_char
+
+    ; Clear the keyboard buffer
+    ldx #$ff
+    stx CH
+
+    tax
+    lsr a
+    lsr a
+    lsr a
+    lsr a
+    tay
+    lda hex_chars,y
+    sta temp
+
+    txa
+    and #$0f
+    tay
+    lda hex_chars,y
+    sta temp + 1
+
+    draw_text 14, 2, temp
+
+skip_char:
 
     ; check if we should update the counters
     ;

From b1c26c1edd4c574a21d7c05153c52374f3080737 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 5 Mar 2023 16:54:40 -0800
Subject: [PATCH 006/104] WIP fix keyboard check

---
 mandel.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 44bff1f..42982cb 100644
--- a/mandel.s
+++ b/mandel.s
@@ -489,7 +489,7 @@ initloop:
     dex
     bne initloop
     sta z_buffer_start
-    sta z_buffer_len
+    sta z_buffer_end
 
 loop:
     ; iter++ & max-iters break
@@ -905,7 +905,7 @@ loop_sx:
     cmp #$ff
     beq skip_char
 
-    ; Clear the keyboard buffer
+    ; Clear the keyboard buffer and re-enable interrupts
     ldx #$ff
     stx CH
 

From 3d792603db942c167765db3ba7fc3872b711e018 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sat, 11 Mar 2023 20:45:32 -0800
Subject: [PATCH 007/104] keyboard nav sorta working

---
 mandel.s | 126 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 93 insertions(+), 33 deletions(-)

diff --git a/mandel.s b/mandel.s
index 42982cb..97544ec 100644
--- a/mandel.s
+++ b/mandel.s
@@ -87,9 +87,13 @@ SYSVBV = $E45F
 XITVBV = $E462
 SETVBV = $E45C
 
-
-; Keys
-KEY_RIGHT = $07
+; Keycodes!
+KEY_PLUS  = $06
+KEY_MINUS = $0e
+KEY_UP    = $8e
+KEY_DOWN  = $8f
+KEY_LEFT  = $86
+KEY_RIGHT = $87
 
 .struct float48
     exponent .byte
@@ -643,12 +647,12 @@ next:
 
 .endproc
 
-.macro zoom_factor dest, src, zoom, aspect
+.macro scale_zoom dest
+    ; clobbers X, flags
     .local cont
     .local enough
 
     ; cx = (sx << (8 - zoom))
-    copy16 dest, src
     ldx zoom
 cont:
     cpx #8
@@ -657,6 +661,12 @@ cont:
     inx
     jmp cont
 enough:
+.endmacro
+
+.macro zoom_factor dest, src, zoom, aspect
+    ; clobbers A, X, flags, etc
+    copy16 dest, src
+    scale_zoom dest
 
     ; cy = cy * (3 / 4)
     ; cx = cx * (5 / 4)
@@ -803,6 +813,74 @@ done:
     ; draw text
 .endproc
 
+.proc keycheck
+    ; Poll CH for a key and apply it: +/- change zoom, arrows move ox/oy.
+    ; clobbers all (A, X, Y, temp)
+    ; returns 255 in A (Z clear) if state change or 0 (Z set) if no change
+    ; check keyboard buffer
+    lda CH
+    cmp #$ff                 ; $ff = nothing pending
+    beq skip_char
+
+    ; Clear the keyboard buffer and re-enable interrupts
+    ldx #$ff
+    stx CH
+
+    tay                      ; keep the key code in Y; A is reused below
+
+    lda zoom                 ; A = zoom for the range checks at plus/minus
+    cpy #KEY_PLUS
+    beq plus
+    cpy #KEY_MINUS
+    beq minus
+
+    ; temp = $0010 << (8 - zoom)
+    lda #$10
+    sta temp
+    lda #$00
+    sta temp + 1
+    scale_zoom temp          ; relies on scale_zoom leaving Y (the key code) intact
+
+    cpy #KEY_UP
+    beq up
+    cpy #KEY_DOWN
+    beq down
+    cpy #KEY_LEFT
+    beq left
+    cpy #KEY_RIGHT
+    beq right
+
+skip_char:                   ; also reached for unhandled keys (already consumed above)
+    lda #0
+    rts
+
+plus:
+    cmp #8                   ; no zoom-in once zoom >= 8
+    bpl skip_char
+    inc zoom
+    jmp done
+minus:
+    cmp #1                   ; no zoom-out once zoom < 1
+    bmi skip_char
+    dec zoom
+    jmp done
+up:
+    sub16 oy, oy, temp       ; oy -= step
+    jmp done
+down:
+    add16 oy, oy, temp       ; oy += step
+    jmp done
+left:
+    sub16 ox, ox, temp       ; ox -= step
+    jmp done
+right:
+    add16 ox, ox, temp       ; ox += step
+done:
+    lda #255
+    rts
+
+.endproc
+
 .proc start
 
     ; ox = 0; oy = 0; zoom = 0
@@ -896,38 +974,17 @@ loop_sy:
 
 loop_sx:
     zoom_factor cx, sx, zoom, aspect_x
+    add16 cx, cx, ox
     zoom_factor cy, sy, zoom, aspect_y
+    add16 cy, cy, oy
     jsr mandelbrot
     jsr pset
 
-    ; check keyboard buffer
-    lda CH
-    cmp #$ff
-    beq skip_char
-
-    ; Clear the keyboard buffer and re-enable interrupts
-    ldx #$ff
-    stx CH
-
-    tax
-    lsr a
-    lsr a
-    lsr a
-    lsr a
-    tay
-    lda hex_chars,y
-    sta temp
-
-    txa
-    and #$0f
-    tay
-    lda hex_chars,y
-    sta temp + 1
-
-    draw_text 14, 2, temp
-
-skip_char:
+    jsr keycheck
+    beq no_key
+    jmp main_loop
 
+no_key:
     ; check if we should update the counters
     ;
     ; count_pixels >= width? update!
@@ -1039,5 +1096,8 @@ loop_sy_done:
 
 loop:
     ; finished
-    jmp loop
+    jsr keycheck
+    beq loop
+    jmp main_loop
+
 .endproc

From 510457f97a0b0b79f90b7247ce0aed10bbe06c50 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sat, 11 Mar 2023 21:15:08 -0800
Subject: [PATCH 008/104] add a note to fix stats when changing zoom

---
 mandel.s | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mandel.s b/mandel.s
index 97544ec..ee99f05 100644
--- a/mandel.s
+++ b/mandel.s
@@ -982,6 +982,7 @@ loop_sx:
 
     jsr keycheck
     beq no_key
+    ; @fixme clear the pixel stats
     jmp main_loop
 
 no_key:

From c152c4346bf1bba5af4ee471d2e5bce71923cd34 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 4 Feb 2024 14:25:15 -0800
Subject: [PATCH 009/104] Progressive pixel layout

---
 mandel.s  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 readme.md |  4 +++-
 2 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/mandel.s b/mandel.s
index ee99f05..ae8b897 100644
--- a/mandel.s
+++ b/mandel.s
@@ -31,7 +31,7 @@ pixel_color     = $b8 ; u8
 pixel_mask      = $b9 ; u8
 pixel_shift     = $ba ; u8
 pixel_offset    = $bb ; u8
-
+fill_level      = $bc ; u8
 
 ; FP registers in zero page
 FR0    = $d4 ; float48
@@ -218,6 +218,15 @@ z_buffer:
 
 .export start
 
+max_fill_level = 6
+fill_masks:
+    .byte %00011111
+    .byte %00001111
+    .byte %00000111
+    .byte %00000011
+    .byte %00000001
+    .byte %00000000
+
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
     clc ; 2 cyc
@@ -959,6 +968,11 @@ copy_byte_loop:
     jsr SETVBV
 
 main_loop:
+    lda #0
+    sta fill_level
+
+fill_loop:
+
     ; sy = -92 .. 91
     lda #(256-half_height)
     sta sy
@@ -973,6 +987,40 @@ loop_sy:
     sta sx + 1
 
 loop_sx:
+    ; check the fill mask
+    ldy #0
+
+loop_skip_level:
+    cpy fill_level
+    beq current_level
+
+    lda fill_masks,y
+    and sx
+    bne not_skipped_mask1
+
+    lda fill_masks,y
+    and sy
+    beq skipped_mask
+
+not_skipped_mask1:
+    iny
+    jmp loop_skip_level
+
+current_level:
+    lda fill_masks,y
+    and sx
+    bne skipped_mask
+
+    lda fill_masks,y
+    and sy
+    beq not_skipped_mask
+
+skipped_mask:
+    jmp skipped
+
+not_skipped_mask:
+
+    ; run the fractal!
     zoom_factor cx, sx, zoom, aspect_x
     add16 cx, cx, ox
     zoom_factor cy, sy, zoom, aspect_y
@@ -983,7 +1031,7 @@ loop_sx:
     jsr keycheck
     beq no_key
     ; @fixme clear the pixel stats
-    jmp main_loop
+    jmp fill_loop
 
 no_key:
     ; check if we should update the counters
@@ -997,7 +1045,7 @@ no_key:
     ; count_frames >= 120? update!
     lda count_frames
     cmp #120 ; >= 2 seconds
-    bmi skip_status
+    bmi skipped
 
 update_status:
     ; FR0 = (float)count_pixels & clear count_pixels
@@ -1061,7 +1109,7 @@ update_status:
     draw_text_indirect speed_start, speed_precision, INBUFF
     draw_text speed_start + speed_precision, str_speed_len, str_speed
 
-skip_status:
+skipped:
 
     clc
     lda sx
@@ -1095,6 +1143,13 @@ loop_sy_done:
 
     draw_text 40 - str_done_len, str_done_len, str_done
 
+fill_loop_done:
+    inc fill_level
+    lda fill_level
+    cmp #max_fill_level
+    beq loop
+    jmp fill_loop
+
 loop:
     ; finished
     jsr keycheck
diff --git a/readme.md b/readme.md
index 46ebd36..6b57378 100644
--- a/readme.md
+++ b/readme.md
@@ -14,7 +14,7 @@ Non-goals:
 
 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
 
--- brion, january 2023
+-- brooke, january 2023 - february 2024
 
 ## Current state
 
@@ -28,6 +28,8 @@ The mandelbrot calculations are done using 4.12-precision fixed point numbers. I
 
 Iterations are capped at 255.
 
+The pixels are run in a progressive layout to get the basic shape on screen faster.
+
 ## Next steps
 
 Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!

From 201d9bf15cf06ad34434535593c03e234be0a322 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 25 Feb 2024 15:15:23 -0800
Subject: [PATCH 010/104] clear screen after zoom/scroll

---
 mandel.s | 59 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/mandel.s b/mandel.s
index ae8b897..6dedb17 100644
--- a/mandel.s
+++ b/mandel.s
@@ -890,6 +890,37 @@ done:
 
 .endproc
 
+.proc clear_screen
+    ; zero the range from framebuffer_top to display_list
+    ; Works in 256-byte steps and stops on the page of display_list; clobbers A, Y, temp.
+    lda #.lobyte(framebuffer_top)
+    sta temp
+    lda #.hibyte(framebuffer_top)
+    sta temp + 1
+zero_page_loop:
+    lda #0
+    ldy #0
+zero_byte_loop:
+    sta (temp),y             ; temp is the 16-bit write pointer, Y the offset within the page
+    iny
+    bne zero_byte_loop       ; Y wraps to 0 after 256 stores
+
+    inc temp + 1             ; advance to the next 256-byte page
+    lda temp + 1
+    cmp #.hibyte(display_list) ; equality test: assumes framebuffer_top's page is below display_list's
+    bne zero_page_loop
+
+    rts
+.endproc
+
+.proc status_bar
+    ; Status bar: draw str_self starting at column 0 and str_run ending at
+    ; column 40. Uses draw_text, which is documented as clobbering A and X.
+    draw_text 0, str_self_len, str_self
+    draw_text 40 - str_run_len, str_run_len, str_run
+    rts
+.endproc
+
 .proc start
 
     ; ox = 0; oy = 0; zoom = 0
@@ -916,24 +947,7 @@ done:
     lda #0
     sta DMACTL
 
-    ; zero the range from framebuffer_top to framebuffer_end
-    lda #.lobyte(framebuffer_top)
-    sta temp
-    lda #.hibyte(framebuffer_top)
-    sta temp + 1
-
-zero_page_loop:
-    lda #0
-    ldy #0
-zero_byte_loop:
-    sta (temp),y
-    iny
-    bne zero_byte_loop
-
-    inc temp + 1
-    lda temp + 1
-    cmp #.hibyte(framebuffer_end)
-    bne zero_page_loop
+    jsr clear_screen
 
     ; Copy the display list into properly aligned memory
     ; Can't cross 1024-byte boundaries :D
@@ -953,10 +967,6 @@ copy_byte_loop:
     sta DLISTH ; actual register
     sta SDLSTH ; shadow register the OS will copy in
 
-    ; Status bar
-    draw_text 0, str_self_len, str_self
-    draw_text 40 - str_run_len, str_run_len, str_run
-
     ; Re-enable display DMA
     lda #$22
     sta DMACTL
@@ -968,6 +978,9 @@ copy_byte_loop:
     jsr SETVBV
 
 main_loop:
+    jsr clear_screen
+    jsr status_bar
+
     lda #0
     sta fill_level
 
@@ -1031,7 +1044,7 @@ not_skipped_mask:
     jsr keycheck
     beq no_key
     ; @fixme clear the pixel stats
-    jmp fill_loop
+    jmp main_loop
 
 no_key:
     ; check if we should update the counters

From ee5b12dae88db3ace0f386b441cc6343de3e488a Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 18 Aug 2024 20:15:47 -0700
Subject: [PATCH 011/104] mailmap

---
 .mailmap | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .mailmap

diff --git a/.mailmap b/.mailmap
new file mode 100644
index 0000000..3102e50
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,2 @@
+Brooke Vibber <bvibber@pobox.com>
+Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>

From 8be03993abb9f498e219a22da1f65c1e8f28e0c5 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 18 Aug 2024 20:29:39 -0700
Subject: [PATCH 012/104] fix time of drawing of 'DONE' text

---
 mandel.s | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 6dedb17..8cbd770 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1154,8 +1154,6 @@ loop_sx_done:
 
 loop_sy_done:
 
-    draw_text 40 - str_done_len, str_done_len, str_done
-
 fill_loop_done:
     inc fill_level
     lda fill_level
@@ -1165,6 +1163,7 @@ fill_loop_done:
 
 loop:
     ; finished
+    draw_text 40 - str_done_len, str_done_len, str_done
     jsr keycheck
     beq loop
     jmp main_loop

From 6f05a9bbd057d0a3d7bfc963e2817117ad5463f5 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 18 Aug 2024 21:06:30 -0700
Subject: [PATCH 013/104] basic palette cycling

---
 mandel.s | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/mandel.s b/mandel.s
index 8cbd770..5976b73 100644
--- a/mandel.s
+++ b/mandel.s
@@ -32,6 +32,7 @@ pixel_mask      = $b9 ; u8
 pixel_shift     = $ba ; u8
 pixel_offset    = $bb ; u8
 fill_level      = $bc ; u8
+palette_offset  = $bd ; u8
 
 ; FP registers in zero page
 FR0    = $d4 ; float48
@@ -87,6 +88,12 @@ SYSVBV = $E45F
 XITVBV = $E462
 SETVBV = $E45C
 
+COLOR0 = $2C4
+COLOR1 = $2C5
+COLOR2 = $2C6
+COLOR3 = $2C7
+COLOR4 = $2C8
+
 ; Keycodes!
 KEY_PLUS  = $06
 KEY_MINUS = $0e
@@ -205,6 +212,11 @@ color_map:
         .byte 3
     .endrepeat
 
+palette:
+    .byte $00
+    .byte $36
+    .byte $88
+    .byte $d4
 .code
 
 z_buffer_len = 16
@@ -809,9 +821,33 @@ done:
 
 .proc vblank_handler
     inc count_frames
+    inc palette_offset
     jmp XITVBV
 .endproc
 
+.proc update_palette
+    ; COLOR4 = palette[0]; COLOR0..2 = palette[1..3] + (palette_offset & $f0).
+    ; palette_offset is bumped by the vblank handler. Clobbers A and flags.
+    lda palette
+    sta COLOR4
+    clc
+    lda palette_offset
+    and #$f0            ; only the high nibble of the offset is applied
+    adc palette + 1
+    sta COLOR0
+    clc                 ; the previous adc may have left carry set
+    lda palette_offset
+    and #$f0
+    adc palette + 2
+    sta COLOR1
+    clc
+    lda palette_offset
+    and #$f0
+    adc palette + 3
+    sta COLOR2
+    rts                 ; fix: without this, execution fell through into the next proc
+.endproc
+
 .proc update_speed
     ; convert frames (u16) to fp
     ; add to frames_total
@@ -971,6 +1007,11 @@ copy_byte_loop:
     lda #$22
     sta DMACTL
 
+    ; Initialize the palette
+    lda #0
+    sta palette_offset
+    jsr update_palette
+
     ; install the vblank handler
     lda #7 ; deferred
     ldx #.hibyte(vblank_handler)
@@ -1122,6 +1163,8 @@ update_status:
     draw_text_indirect speed_start, speed_precision, INBUFF
     draw_text speed_start + speed_precision, str_speed_len, str_speed
 
+    jsr update_palette
+
 skipped:
 
     clc

From c559b6e76b138052af33c9b1b1fe3f3a06aa5711 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 18 Aug 2024 21:07:53 -0700
Subject: [PATCH 014/104] palette adjustment

---
 mandel.s | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mandel.s b/mandel.s
index 5976b73..8068fff 100644
--- a/mandel.s
+++ b/mandel.s
@@ -214,9 +214,9 @@ color_map:
 
 palette:
     .byte $00
-    .byte $36
-    .byte $88
-    .byte $d4
+    .byte $46
+    .byte $78
+    .byte $b4
 .code
 
 z_buffer_len = 16

From 29630c88872c8b22a2b357983f1cd0fc86bb197d Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 19 Aug 2024 13:21:44 -0700
Subject: [PATCH 015/104] update palette more smoothly

---
 mandel.s | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 8068fff..3db6a77 100644
--- a/mandel.s
+++ b/mandel.s
@@ -822,6 +822,7 @@ done:
 .proc vblank_handler
     inc count_frames
     inc palette_offset
+    jsr update_palette
     jmp XITVBV
 .endproc
 
@@ -1163,8 +1164,6 @@ update_status:
     draw_text_indirect speed_start, speed_precision, INBUFF
     draw_text speed_start + speed_precision, str_speed_len, str_speed
 
-    jsr update_palette
-
 skipped:
 
     clc

From 5637783529e71dbbdc568d853d5df5616f25970c Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sat, 11 Feb 2023 12:24:48 -0800
Subject: [PATCH 016/104] Faster imul16 routine

Improves runtime from 16.24 ms/px to 14.44 ms/px

This uses a routine found on Everything2:
https://everything2.com/title/Fast+6502+multiplication

which uses a lookup table of squares to do 8-bit imuls,
which are then composed into a 16-bit imul
---
 .gitignore |   1 +
 Makefile   |   8 ++-
 mandel.s   | 176 +++++++++++++++++++++++++++++------------------------
 tables.js  |  38 ++++++++++++
 testme.js  |  41 +++++++++++++
 5 files changed, 183 insertions(+), 81 deletions(-)
 create mode 100644 tables.js
 create mode 100644 testme.js

diff --git a/.gitignore b/.gitignore
index 8d2f7ce..771e47a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
diff --git a/Makefile b/Makefile
index 25148b4..008bf8c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,17 @@
 
 all : mandel.xex
 
-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 
 %.o : %.s
 	ca65 -o $@ $<
 
+tables.s : tables.js
+	node tables.js > tables.s
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
 
diff --git a/mandel.s b/mandel.s
index 3db6a77..1244a02 100644
--- a/mandel.s
+++ b/mandel.s
@@ -25,14 +25,14 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $b1 ; u8: index into z_buffer
 z_buffer_end    = $b2 ; u8: index into z_buffer
 temp            = $b4 ; u16
-
-pixel_ptr       = $b6 ; u16
-pixel_color     = $b8 ; u8
-pixel_mask      = $b9 ; u8
-pixel_shift     = $ba ; u8
-pixel_offset    = $bb ; u8
-fill_level      = $bc ; u8
-palette_offset  = $bd ; u8
+temp2           = $b6 ; u16
+pixel_ptr       = $b8 ; u16
+pixel_color     = $ba ; u8
+pixel_mask      = $bb ; u8
+pixel_shift     = $bc ; u8
+pixel_offset    = $bd ; u8
+fill_level      = $be ; u8
+palette_offset  = $bf ; u8
 
 ; FP registers in zero page
 FR0    = $d4 ; float48
@@ -107,6 +107,10 @@ KEY_RIGHT = $87
     mantissa .byte 6
 .endstruct
 
+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data
 
 strings:
@@ -257,6 +261,12 @@ fill_masks:
     add 4, dest, arg2, dest
 .endmacro
 
+.macro add_carry dest
+    lda dest
+    adc #0
+    sta dest
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
     sec ; 2 cyc
@@ -334,65 +344,15 @@ fill_masks:
     neg 4, arg
 .endmacro
 
-; inner loop for imul16
-; bitnum < 8: 25 or 41 cycles
-; bitnum >= 8: 30 or 46 cycles
-.macro bitmul16 arg1, arg2, result, bitnum
-    .local zero
-    .local one
-    .local next
-
-    ; does 16-bit adds
-    ; arg1 and arg2 are treated as unsigned
-    ; negative signed inputs must be flipped first
-
-    ; 7 cycles up to the branch
-
-    ; check if arg1 has 0 or 1 bit in this place
-    ; 5 cycles either way
-    .if bitnum < 8
-        lda arg1                 ; 3 cyc
-        and #(1 << (bitnum))       ; 2 cyc
-    .else
-        lda arg1 + 1             ; 3 cyc
-        and #(1 << ((bitnum) - 8)) ; 2 cyc
-    .endif
-    bne one ; 2 cyc
-
-zero: ; 18 cyc, 23 cyc
-    lsr result + 3 ; 5 cyc
-    jmp next       ; 3 cyc
-
-one: ; 32 cyc, 37 cyc
-    ; 16-bit add on the top bits
-    clc            ; 2 cyc
-    lda result + 2 ; 3 cyc
-    adc arg2       ; 3 cyc
-    sta result + 2 ; 3 cyc
-    lda result + 3 ; 3 cyc
-    adc arg2 + 1   ; 3 cyc
-    ror a          ; 2 cyc - get a jump on the shift
-    sta result + 3 ; 3 cyc
-next:
-    ror result + 2 ; 5 cyc
-    ror result + 1 ; 5 cyc
-    .if bitnum >= 8
-        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
-        ; when it's all uninitialized data
-        ror result ; 5 cyc
-    .endif
-
-.endmacro
-
 ; 5 to 25 cycles
 .macro check_sign arg
     ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register.
     .local positive
     lda arg + 1   ; 3 cyc
     bpl positive  ; 2 cyc
     neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
+    iny           ; 2 cyc
 positive:
 .endmacro
 
@@ -419,35 +379,93 @@ positive:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
-; min 470 cycles
-; max 780 cycles
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+.macro imul8 dest, arg1, arg2
+    .local under256
+    .local next
+    .local small_product
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
+
+        lda mul_factor_a      ; setup: 6 cycles
+        ;ldx mul_factor_x
+
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x
+        tax
+        bcc under256
+        lda mul_hibyte512,x
+        bcs next
+    under256:
+        lda mul_hibyte256,x
+        sec
+    next:
+        sta mul_product_hi
+        lda mul_lobyte256,x
+
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1
+
+        clc
+        adc mul_product_lo
+        bcc small_product
+        inc mul_product_hi
+    small_product:
+        sec                   ; - x^2/2: 25 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+    .endscope
+.endmacro
+
 .proc imul16_func
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
+    inter = temp2
 
-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
     check_sign arg1 ; 5 to 25 cyc
     check_sign arg2 ; 5 to 25 cyc
-    
-    ; zero out the 32-bit temp's top 16 bits
-    lda #0          ; 2 cyc
-    sta result + 2  ; 3 cyc
-    sta result + 3  ; 3 cyc
-    ; the bottom two bytes will get cleared by the shifts
 
-    ; unrolled loop for maximum speed, at the cost
-    ; of a larger routine
-    ; 440 to 696 cycles
-    .repeat 16, bitnum
-        ; bitnum < 8: 25 or 41 cycles
-        ; bitnum >= 8: 30 or 46 cycles
-        bitmul16 arg1, arg2, result, bitnum
-    .endrepeat
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+
+    imul8 result, arg1, arg2
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter
 
     ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
+    cpy #1              ; 2 cyc
     bne positive_result ; 2 cyc
     neg32 result        ; 34 cyc
 positive_result:
diff --git a/tables.js b/tables.js
new file mode 100644
index 0000000..c772f81
--- /dev/null
+++ b/tables.js
@@ -0,0 +1,38 @@
+function db(func) { // Format func(0)..func(255) as 16 rows of ca65 .byte data, 16 values per row.
+    let lines = [];
+    for (let i = 0; i < 256; i += 16) {
+        let items = [];
+        for (let j = 0; j < 16; j++) {
+            let x = i + j; // table index, 0..255
+            items.push(func(x));
+        }
+        lines.push('    .byte ' + items.join(', '));
+    }
+    return lines.join('\n');
+}
+// squares[i] = trunc((i*i + 1) / 2), i.e. i^2/2 rounded half-up, for i = 0..511.
+let squares = [];
+for (let i = 0; i < 512; i++) {
+    squares.push(Math.trunc((i * i + 1) / 2));
+}
+// Tables: low byte of squares[0..255], bits 8-15 of squares[0..255], bits 8-15 of squares[256..511].
+console.log( // the Makefile redirects this output into tables.s
+`.segment "TABLES"
+
+.export mul_lobyte256
+.export mul_hibyte256
+.export mul_hibyte512
+
+.align 256
+mul_lobyte256:
+${db((i) => squares[i] & 0xff)}
+
+.align 256
+mul_hibyte256:
+${db((i) => (squares[i] >> 8) & 0xff)}
+
+.align 256
+mul_hibyte512:
+${db((i) => (squares[i + 256] >> 8) & 0xff)}
+
+`);
diff --git a/testme.js b/testme.js
new file mode 100644
index 0000000..e12e706
--- /dev/null
+++ b/testme.js
@@ -0,0 +1,41 @@
+// Reference model of the table-of-squares multiply: a*x = (a + x)^2/2 - a^2/2 - x^2/2
+// half_square(x): x^2/2 rounded half-up by Math.round, truncated to the low 16 bits.
+function half_square(x) {
+    return Math.round(x * x / 2) & 0xffff >>> 0; // shift binds tighter than &: parsed as v & (0xffff >>> 0)
+}
+// mul8(a, b): product built from three half-squares, each step wrapped to 16 bits.
+function mul8(a, b) {
+    let result = half_square(a + b) & 0xffff;
+    result = (result - half_square(a)) & 0xffff;
+    result = (result - half_square(b)) & 0xffff;
+    result = (result + (b & a & 1)) & 0xffff; // +1 when both operands are odd (two half-up roundings subtracted)
+    return result >>> 0;
+}
+// mul16(a, b): combines four mul8 partial products of the high/low bytes of a and b.
+function mul16(a, b) {
+    let ah = (a & 0xff00) >>> 8;
+    let al = (a & 0x00ff) >>> 0;
+    let bh = (b & 0xff00) >>> 8;
+    let bl = (b & 0x00ff) >>> 0;
+    let result = (mul8(al, bl) & 0xffff) >>> 0; // al*bl
+    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; // + ah*bl * 256
+    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; // + al*bh * 256
+    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; // + ah*bh * 65536, wrapped to unsigned 32 bits
+    return result;
+}
+// Exclusive upper bound for both operands; the commented alternatives give shorter runs.
+let max = 65536;
+//let max = 256;
+//let max = 128;
+//let max = 8;
+// Compare mul16 with Math.imul for every pair in [0, max) and log each mismatch; silent when all agree.
+for (let a = 0; a < max; a++) {
+    for (let b = 0; b < max; b++) {
+        let expected = Math.imul(a, b) >>> 0;
+        //let actual = mul8(a, b);
+        let actual = mul16(a, b);
+        if (expected !== actual) {
+            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
+        }
+    }
+}
\ No newline at end of file

From 7f2bc43cff173e7dffd9a5629bb9bcb56f374259 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 14 Dec 2024 18:56:26 -0800
Subject: [PATCH 017/104] squares

---
 readme.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/readme.md b/readme.md
index 6b57378..873793f 100644
--- a/readme.md
+++ b/readme.md
@@ -37,6 +37,7 @@ Add a running counter of ms/px using the vertical blank interrupts as a timer. T
 Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
 
 I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
+(done)
 
 ## Deps and build instructions
 

From 05133aabdd59739805bbe7bb2eb32e9815120718 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 15 Dec 2024 20:17:45 -0800
Subject: [PATCH 018/104] slightly faster handling of signed mul

previously we were flipping the inputs if negative, and then the
output if exactly one of the inputs was negative

turns out you can just treat the whole thing as an unsigned mul
and then subtract each term from the high word if the other term
is negative.

https://stackoverflow.com/a/28827013

this saves a handful of cycles, reducing our runtime to a mere
14.211 ms/px \o/
---
 mandel.s | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/mandel.s b/mandel.s
index 1244a02..3622995 100644
--- a/mandel.s
+++ b/mandel.s
@@ -344,18 +344,6 @@ fill_masks:
     neg 4, arg
 .endmacro
 
-; 5 to 25 cycles
-.macro check_sign arg
-    ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the Y register.
-    .local positive
-    lda arg + 1   ; 3 cyc
-    bpl positive  ; 2 cyc
-    neg16 arg     ; 18 cyc
-    iny           ; 2 cyc
-positive:
-.endmacro
-
 ; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
     copy16 FR0, arg1  ; 12 cyc
@@ -438,11 +426,6 @@ positive:
     result = FR2 ; 32-bit result
     inter = temp2
 
-    ldy #0          ; 2 cyc
-    ; counts the number of sign bits in Y
-    check_sign arg1 ; 5 to 25 cyc
-    check_sign arg2 ; 5 to 25 cyc
-
     ; h1l1 * h2l2
     ; (h1*256 + l1) * (h2*256 + l2)
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
@@ -464,11 +447,16 @@ positive:
     imul8 inter, arg1 + 1, arg2 + 1
     add16 result + 2, result + 2, inter
 
-    ; In case of mixed input signs, return a negative result.
-    cpy #1              ; 2 cyc
-    bne positive_result ; 2 cyc
-    neg32 result        ; 34 cyc
-positive_result:
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1
+arg2_pos:
 
     rts ; 6 cyc
 .endproc

From 405cec6d511947ccc1a0dcc3c79e06e4ac1a5278 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Wed, 25 Dec 2024 10:51:27 -0800
Subject: [PATCH 019/104] WIP imul8 via table experiments

planning to try a 64KB table of 8x7-bit multiplies in the high memory
on a 130XE or other high-memory-capable machine

not yet working or finished

too many cycles of overhead per invocation
---
 imul8xe.s | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mandel.s  | 71 ++++++++++++++++++++++-------------------
 2 files changed, 133 insertions(+), 32 deletions(-)
 create mode 100644 imul8xe.s

diff --git a/imul8xe.s b/imul8xe.s
new file mode 100644
index 0000000..5cbb852
--- /dev/null
+++ b/imul8xe.s
@@ -0,0 +1,94 @@
+FR0    = $d4 ; float48
+PORTB = $d301
+
+
+EXTENDED_RAM = $4000 ; 16KiB bank on the XE
+bankswitch = ; ???
+
+; input in X/Y (lo/hi)
+; output in FR0
+; clobbers FR0
+; 128 cycles
+proc imul8xe
+    output = FR0
+    ptr = FR0 + 2
+
+    lda #0       ; 2 cyc
+    sta ptr      ; 3 cyc
+    sta ptr + 1  ; 3 cyc
+
+    ; bottom 14 bits except the LSB are the per-bank table index
+    ; add $4000 for the bank pointer
+    txa          ; 2 cyc
+    and #$fe     ; 2 cyc
+    sta ptr      ; 3 cyc
+    tya          ; 2 cyc
+    and #$3f     ; 2 cyc
+    clc          ; 2 cyc
+    adc #$40     ; 2 cyc
+    sta ptr + 1  ; 3 cyc
+    
+    ; top 2 bits are the table bank selector
+    tya          ; 2 cyc
+    and #$c0     ; 2 cyc
+    ; shift in extended RAM mode 2x 1 bits
+    sec          ; 2 cyc
+    ror          ; 2 cyc
+    ror          ; 2 cyc
+    ; shift in 0 bits
+    asr          ; 2 cyc
+    asr          ; 2 cyc
+    asr          ; 2 cyc
+
+    ; save the second param for later
+    phy          ; 3 cyc
+
+    ; disable interrupts
+    lda NMIEN    ; 4 cyc
+    pha          ; 3 cyc
+    lda #0       ; 2 cyc
+    sta NMIEN    ; 4 cyc
+
+    ; set the standard top RAM and OS ROM on
+    or #$81      ; 2 cyc
+    sta PORTB    ; 4 cyc
+
+
+    ; copy the entry into output
+    ldy #0       ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output   ; 3 cyc
+    iny          ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output+1 ; 3 cyc
+
+    ; restore memory
+    lda #$81     ; 2 cyc
+    sta PORTB    ; 4 cyc
+
+    ; restore interrupts
+    pla          ; 3 cyc
+    sta NMIEN    ; 4 cyc
+
+    ; check that 1 bit we skipped to fit into space
+    txa          ; 2 cyc
+    and $#1      ; 2 cyc
+    beq done     ; 2 cyc
+
+    ; add the second param one last tie for the skipped bit
+    clc          ; 2 cyc
+    pla          ; 3 cyc
+    adc output   ; 3 cyc
+    sta output   ; 3 cyc
+    lda #0       ; 2 cyc
+    adc output+1 ; 3 cyc
+    sta output+1 ; 3 cyc
+
+done:
+    pla
+    rts          ; 6 cyc
+endproc
+
+proc imul8xe_init
+    rts
+endproc
diff --git a/mandel.s b/mandel.s
index 3622995..3b0bc9f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -372,51 +372,58 @@ fill_masks:
     .local under256
     .local next
     .local small_product
+    ; circa 92 cycles? this doesn't seem right
     .scope
         mul_factor_a   = arg1
         mul_factor_x   = arg2
         mul_product_lo = dest
         mul_product_hi = dest + 1
 
-        lda mul_factor_a      ; setup: 6 cycles
-        ;ldx mul_factor_x
+        lda mul_factor_a      ; 3 cyc
 
-        clc                   ; (a + x)^2/2: 23 cycles
-        adc mul_factor_x
-        tax
-        bcc under256
-        lda mul_hibyte512,x
-        bcs next
+        ; (a + x)^2/2
+        clc                   ; 2 cyc         
+        adc mul_factor_x      ; 3 cyc
+        tax                   ; 2 cyc
+        bcc under256          ; 2 cyc
+        lda mul_hibyte512,x   ; 4 cyc
+        bcs next              ; 2 cyc
     under256:
-        lda mul_hibyte256,x
-        sec
+        lda mul_hibyte256,x   ; 4 cyc
+        sec                   ; 2 cyc
     next:
-        sta mul_product_hi
-        lda mul_lobyte256,x
+        sta mul_product_hi    ; 3 cyc
+        lda mul_lobyte256,x   ; 4 cyc
 
-        ldx mul_factor_a      ; - a^2/2: 20 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
+        ; - a^2/2
+        ldx mul_factor_a      ; 3 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc
+        sta mul_product_hi    ; 3 cyc
 
-        ldx mul_factor_x      ; + x & a & 1: 22 cycles
-        txa                   ; (this is a kludge to correct a
-        and mul_factor_a      ; roundoff error that makes odd * odd too low)
-        and #1
+        ; + x & a & 1:
+        ; (this is a kludge to correct a
+        ; roundoff error that makes odd * odd too low)
+        ldx mul_factor_x      ; 3 cyc
+        txa                   ; 2 cyc
+        and mul_factor_a      ; 3 cyc
+        and #1                ; 2 cyc
 
-        clc
-        adc mul_product_lo
-        bcc small_product
-        inc mul_product_hi
+        clc                   ; 2 cyc
+        adc mul_product_lo    ; 3 cyc
+        bcc small_product     ; 2 cyc
+        inc mul_product_hi    ; 5 cyc
+
+        ; - x^2/2
     small_product:
-        sec                   ; - x^2/2: 25 cycles
-        sbc mul_lobyte256,x
-        sta mul_product_lo
-        lda mul_product_hi
-        sbc mul_hibyte256,x
-        sta mul_product_hi
+        sec                   ; 2 cyc
+        sbc mul_lobyte256,x   ; 4 cyc
+        sta mul_product_lo    ; 3 cyc
+        lda mul_product_hi    ; 3 cyc
+        sbc mul_hibyte256,x   ; 4 cyc
+        sta mul_product_hi    ; 3 cyc
     .endscope
 .endmacro
 

From f996c3cbcd84b3aff3fd39bf3daee9a6c60a9e2a Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Wed, 25 Dec 2024 12:47:37 -0800
Subject: [PATCH 020/104] provisional maybe

old mode runs in 81-92 cycles

provisional code runs in 58-77 cycles

if it works ;)
---
 imul8xe.s | 76 ++++++++++++++++++++-----------------------------------
 mandel.s  |  1 +
 2 files changed, 29 insertions(+), 48 deletions(-)

diff --git a/imul8xe.s b/imul8xe.s
index 5cbb852..d12f53f 100644
--- a/imul8xe.s
+++ b/imul8xe.s
@@ -3,55 +3,38 @@ PORTB = $d301
 
 
 EXTENDED_RAM = $4000 ; 16KiB bank on the XE
-bankswitch = ; ???
 
-; input in X/Y (lo/hi)
-; output in FR0
-; clobbers FR0
-; 128 cycles
-proc imul8xe
-    output = FR0
-    ptr = FR0 + 2
+; lookup table for top byte -> PORTB value for bank-switch
+.align 256
+bankswitch:
+    .repeat 256, i
+        .byte ((i & $c0) >> 5) | $c1
+    .endrepeat
 
-    lda #0       ; 2 cyc
-    sta ptr      ; 3 cyc
-    sta ptr + 1  ; 3 cyc
+; 58-77 cycles
+.macro imul8xe dest, arg1, arg2
+.local done
+.local output
+.local ptr
+
+    output = dest
+    ptr = dest + 2 ; scratch space assumed
 
     ; bottom 14 bits except the LSB are the per-bank table index
     ; add $4000 for the bank pointer
-    txa          ; 2 cyc
+    lda arg1     ; 3 cyc
     and #$fe     ; 2 cyc
     sta ptr      ; 3 cyc
-    tya          ; 2 cyc
+    lda arg2     ; 3 cyc
     and #$3f     ; 2 cyc
     clc          ; 2 cyc
     adc #$40     ; 2 cyc
     sta ptr + 1  ; 3 cyc
     
     ; top 2 bits are the table bank selector
-    tya          ; 2 cyc
-    and #$c0     ; 2 cyc
-    ; shift in extended RAM mode 2x 1 bits
-    sec          ; 2 cyc
-    ror          ; 2 cyc
-    ror          ; 2 cyc
-    ; shift in 0 bits
-    asr          ; 2 cyc
-    asr          ; 2 cyc
-    asr          ; 2 cyc
-
-    ; save the second param for later
-    phy          ; 3 cyc
-
-    ; disable interrupts
-    lda NMIEN    ; 4 cyc
-    pha          ; 3 cyc
-    lda #0       ; 2 cyc
-    sta NMIEN    ; 4 cyc
-
-    ; set the standard top RAM and OS ROM on
-    or #$81      ; 2 cyc
-    sta PORTB    ; 4 cyc
+    ldx arg2          ; 3 cyc
+    lda bank_switch,x ; 4 cyc
+    sta PORTB         ; 4 cyc
 
 
     ; copy the entry into output
@@ -62,22 +45,21 @@ proc imul8xe
     lda (ptr),y  ; 5 cyc
     sta output+1 ; 3 cyc
 
-    ; restore memory
-    lda #$81     ; 2 cyc
-    sta PORTB    ; 4 cyc
-
-    ; restore interrupts
-    pla          ; 3 cyc
-    sta NMIEN    ; 4 cyc
+    ; note: we are not restoring memory to save 6 cycles!
+    ; this means those 16kb have to be switched back to base RAM
+    ; if we need to use them anywhere else
+    ;;; restore memory
+    ;;lda #$81     ; 2 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
 
     ; check that 1 bit we skipped to fit into space
-    txa          ; 2 cyc
+    lda arg1     ; 3 cyc
     and $#1      ; 2 cyc
     beq done     ; 2 cyc
 
     ; add the second param one last tie for the skipped bit
     clc          ; 2 cyc
-    pla          ; 3 cyc
+    lda arg2     ; 3 cyc
     adc output   ; 3 cyc
     sta output   ; 3 cyc
     lda #0       ; 2 cyc
@@ -85,9 +67,7 @@ proc imul8xe
     sta output+1 ; 3 cyc
 
 done:
-    pla
-    rts          ; 6 cyc
-endproc
+.endmacro
 
 proc imul8xe_init
     rts
diff --git a/mandel.s b/mandel.s
index 3b0bc9f..e0a8570 100644
--- a/mandel.s
+++ b/mandel.s
@@ -373,6 +373,7 @@ fill_masks:
     .local next
     .local small_product
     ; circa 92 cycles? this doesn't seem right
+    ; 81-92 cycles
     .scope
         mul_factor_a   = arg1
         mul_factor_x   = arg2

From 829d2860e8f946a088218fa5cde2e07067e0dfa6 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 26 Dec 2024 12:04:01 -0800
Subject: [PATCH 021/104] :P

---
 imul8xe.s | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/imul8xe.s b/imul8xe.s
index d12f53f..15adf64 100644
--- a/imul8xe.s
+++ b/imul8xe.s
@@ -12,6 +12,7 @@ bankswitch:
     .endrepeat
 
 ; 58-77 cycles
+; clobbers x, y, dest to dest + 3
 .macro imul8xe dest, arg1, arg2
 .local done
 .local output
@@ -54,10 +55,10 @@ bankswitch:
 
     ; check that 1 bit we skipped to fit into space
     lda arg1     ; 3 cyc
-    and $#1      ; 2 cyc
+    and #1       ; 2 cyc
     beq done     ; 2 cyc
 
-    ; add the second param one last tie for the skipped bit
+    ; add the second param one last time for the skipped bit
     clc          ; 2 cyc
     lda arg2     ; 3 cyc
     adc output   ; 3 cyc

From a9d551a98d01a3634cb5068fc00506f2b398f8d2 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 26 Dec 2024 17:50:59 -0800
Subject: [PATCH 022/104] first draft initializer

---
 imul8xe.s | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/imul8xe.s b/imul8xe.s
index 15adf64..855e044 100644
--- a/imul8xe.s
+++ b/imul8xe.s
@@ -70,6 +70,106 @@ bankswitch:
 done:
 .endmacro
 
+.macro bank_switch bank
+    lda #((bank << 1) | $c1)
+    sta PORTB
+.endmacro
+
 proc imul8xe_init
+
+    ; go through the input set, in four 16KB chunks
+
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+
+    lda #$00
+    sta arg1
+    sta arg2
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
     rts
 endproc
+
+; Initialize a 16 KB chunk of the table
+; input: multipliers in temp
+; output: new multipliers in temp
+; clobbers: temp, temp2
+proc imul8xe_init_section
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+    ptr = temp2
+
+    lda #$00
+    sta ptr
+    lda #$40
+    sta ptr + 1
+
+    ldx #0
+    ldy #0
+
+    ; outer loop: $00 -> $3f
+outer_loop:
+
+    ; reset result to 0
+    lda #0
+    sta result
+    sta result + 1
+
+    ; inner loop: $00 -> $ff
+inner_loop:
+
+    ; copy result to data set
+    lda result
+    sta (ptr),y
+    lda result + 1
+    sta (ptr),y
+
+    ; result += 2 * arg2
+    clc
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result
+
+    ; inner loop check
+    inc arg1
+    inc arg1
+    inc ptr
+    inc ptr
+    bne inner_loop
+
+    ; outer loop check
+    inc arg2
+    inc ptr + 1
+    lda ptr + 1
+    cmp #$40
+    bne outer_loop
+
+    rts
+
+endproc

From 34ce9da030ea3ee9853a8e5ecf64f65798faaded Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 26 Dec 2024 18:17:01 -0800
Subject: [PATCH 023/104] builds, not used yet

---
 mandel.s | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)

diff --git a/mandel.s b/mandel.s
index e0a8570..d824193 100644
--- a/mandel.s
+++ b/mandel.s
@@ -74,6 +74,9 @@ width = 160
 half_width = width >> 1
 stride = width >> 2
 
+EXTENDED_RAM = $4000 ; 16KiB bank on the XE
+PORTB  = $D301 ; memory & bank-switch for XL/XE
+
 DMACTL = $D400
 DLISTL = $D402
 DLISTH = $D403
@@ -428,6 +431,176 @@ fill_masks:
     .endscope
 .endmacro
 
+; lookup table for top byte -> PORTB value for bank-switch
+;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
+bankswitch:
+    .repeat 256, i
+        .byte ((i & $c0) >> 5) | $c1
+    .endrepeat
+
+; 58-77 cycles
+; clobbers x, y, dest to dest + 3
+.macro imul8xe dest, arg1, arg2
+.local done
+.local output
+.local ptr
+
+    output = dest
+    ptr = dest + 2 ; scratch space assumed
+
+    ; bottom 14 bits except the LSB are the per-bank table index
+    ; add $4000 for the bank pointer
+    lda arg1     ; 3 cyc
+    and #$fe     ; 2 cyc
+    sta ptr      ; 3 cyc
+    lda arg2     ; 3 cyc
+    and #$3f     ; 2 cyc
+    clc          ; 2 cyc
+    adc #$40     ; 2 cyc
+    sta ptr + 1  ; 3 cyc
+    
+    ; top 2 bits are the table bank selector
+    ldx arg2          ; 3 cyc
+    lda bank_switch,x ; 4 cyc
+    sta PORTB         ; 4 cyc
+
+
+    ; copy the entry into output
+    ldy #0       ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output   ; 3 cyc
+    iny          ; 2 cyc
+    lda (ptr),y  ; 5 cyc
+    sta output+1 ; 3 cyc
+
+    ; note: we are not restoring memory to save 6 cycles!
+    ; this means those 16kb have to be switched back to base RAM
+    ; if we need to use them anywhere else
+    ;;; restore memory
+    ;;lda #$81     ; 2 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
+
+    ; check that 1 bit we skipped to fit into space
+    lda arg1     ; 3 cyc
+    and #1       ; 2 cyc
+    beq done     ; 2 cyc
+
+    ; add the second param one last time for the skipped bit
+    clc          ; 2 cyc
+    lda arg2     ; 3 cyc
+    adc output   ; 3 cyc
+    sta output   ; 3 cyc
+    lda #0       ; 2 cyc
+    adc output+1 ; 3 cyc
+    sta output+1 ; 3 cyc
+
+done:
+.endmacro
+
+.macro bank_switch bank
+    lda #((bank << 1) | $c1)
+    sta PORTB
+.endmacro
+
+.proc imul8xe_init
+
+    ; go through the input set, in four 16KB chunks
+
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+
+    lda #$00
+    sta arg1
+    sta arg2
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc
+
+; Initialize a 16 KB chunk of the table
+; input: multipliers in temp
+; output: new multipliers in temp
+; clobbers: temp, temp2
+.proc imul8xe_init_section
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+    ptr = temp2
+
+    lda #$00
+    sta ptr
+    lda #$40
+    sta ptr + 1
+
+    ldx #0
+    ldy #0
+
+    ; outer loop: $00 -> $3f
+outer_loop:
+
+    ; reset result to 0
+    lda #0
+    sta result
+    sta result + 1
+
+    ; inner loop: $00 -> $ff
+inner_loop:
+
+    ; copy result to data set
+    lda result
+    sta (ptr),y
+    lda result + 1
+    sta (ptr),y
+
+    ; result += 2 * arg2
+    clc
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result
+    lda arg2
+    adc result
+    sta result
+    lda #0
+    adc result + 1
+    sta result
+
+    ; inner loop check
+    inc arg1
+    inc arg1
+    inc ptr
+    inc ptr
+    bne inner_loop
+
+    ; outer loop check
+    inc arg2
+    inc ptr + 1
+    lda ptr + 1
+    cmp #$40
+    bne outer_loop
+
+    rts
+
+.endproc
+
 .proc imul16_func
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)

From 45c5a4cb2d62d6fbed4ba64364220eb8827369f0 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 26 Dec 2024 18:20:10 -0800
Subject: [PATCH 024/104] called, gets lost

---
 mandel.s | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index d824193..a8f3cac 100644
--- a/mandel.s
+++ b/mandel.s
@@ -433,7 +433,7 @@ fill_masks:
 
 ; lookup table for top byte -> PORTB value for bank-switch
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
-bankswitch:
+bank_switch_table:
     .repeat 256, i
         .byte ((i & $c0) >> 5) | $c1
     .endrepeat
@@ -460,9 +460,9 @@ bankswitch:
     sta ptr + 1  ; 3 cyc
     
     ; top 2 bits are the table bank selector
-    ldx arg2          ; 3 cyc
-    lda bank_switch,x ; 4 cyc
-    sta PORTB         ; 4 cyc
+    ldx arg2                ; 3 cyc
+    lda bank_switch_table,x ; 4 cyc
+    sta PORTB               ; 4 cyc
 
 
     ; copy the entry into output
@@ -612,20 +612,20 @@ inner_loop:
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
-    imul8 result, arg1, arg2
+    imul8xe result, arg1, arg2
     lda #0
     sta result + 2
     sta result + 3
 
-    imul8 inter, arg1 + 1, arg2
+    imul8xe inter, arg1 + 1, arg2
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1, arg2 + 1
+    imul8xe inter, arg1, arg2 + 1
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1 + 1, arg2 + 1
+    imul8xe inter, arg1 + 1, arg2 + 1
     add16 result + 2, result + 2, inter
 
     ; In case of negative inputs, adjust high word
@@ -1147,6 +1147,8 @@ zero_byte_loop:
 
 .proc start
 
+    jsr imul8xe_init
+
     ; ox = 0; oy = 0; zoom = 0
     ; count_frames = 0; count_pixels = 0
     lda #0

From 0cde31905e62b9b97d8df3ea03c73a89bbb5d602 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 26 Dec 2024 18:35:37 -0800
Subject: [PATCH 025/104] runs but doesn't work

---
 mandel.s | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index a8f3cac..79d9c78 100644
--- a/mandel.s
+++ b/mandel.s
@@ -548,7 +548,6 @@ done:
     lda #$40
     sta ptr + 1
 
-    ldx #0
     ldy #0
 
     ; outer loop: $00 -> $3f
@@ -566,7 +565,9 @@ inner_loop:
     lda result
     sta (ptr),y
     lda result + 1
+    iny
     sta (ptr),y
+    dey
 
     ; result += 2 * arg2
     clc
@@ -594,7 +595,7 @@ inner_loop:
     inc arg2
     inc ptr + 1
     lda ptr + 1
-    cmp #$40
+    cmp #$80
     bne outer_loop
 
     rts

From e84a990789b13c6c67e63cdb2db2a2be2b7893a6 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 26 Dec 2024 21:41:03 -0800
Subject: [PATCH 026/104] tweaks:

---
 mandel.s | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index 79d9c78..8c6130b 100644
--- a/mandel.s
+++ b/mandel.s
@@ -435,9 +435,15 @@ fill_masks:
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
     .repeat 256, i
-        .byte ((i & $c0) >> 5) | $c1
+        .byte ((i & $c0) >> 4) | $d1
     .endrepeat
 
+.macro bank_switch bank
+    lda #((bank << 2) | $d1)
+    sta PORTB
+.endmacro
+
+
 ; 58-77 cycles
 ; clobbers x, y, dest to dest + 3
 .macro imul8xe dest, arg1, arg2
@@ -497,11 +503,6 @@ bank_switch_table:
 done:
 .endmacro
 
-.macro bank_switch bank
-    lda #((bank << 1) | $c1)
-    sta PORTB
-.endmacro
-
 .proc imul8xe_init
 
     ; go through the input set, in four 16KB chunks
@@ -576,13 +577,14 @@ inner_loop:
     sta result
     lda #0
     adc result + 1
-    sta result
+    sta result + 1
+    clc
     lda arg2
     adc result
     sta result
     lda #0
     adc result + 1
-    sta result
+    sta result + 1
 
     ; inner loop check
     inc arg1

From ee1c2687054d760d21ffe8f1be97eb5eb6ecc7b9 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 26 Dec 2024 21:49:13 -0800
Subject: [PATCH 027/104] it works

---
 mandel.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 8c6130b..3ff91d1 100644
--- a/mandel.s
+++ b/mandel.s
@@ -435,11 +435,11 @@ fill_masks:
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
     .repeat 256, i
-        .byte ((i & $c0) >> 4) | $d1
+        .byte ((i & $c0) >> 4) | $e1
     .endrepeat
 
 .macro bank_switch bank
-    lda #((bank << 2) | $d1)
+    lda #((bank << 2) | $e1)
     sta PORTB
 .endmacro
 

From 83cba4afa3e28cc8f6b0377c9edc49e60af36187 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Fri, 27 Dec 2024 18:37:03 -0800
Subject: [PATCH 028/104] Runtime detection of XE-style extended memory

Uses the "big multiplication table" in 64KB of extended memory if
bank switching appears to work, otherwise uses the table of squares
lookups.

Initial view clocks in at 13.133 ms/px for the XE version and still
14.211 ms/px for the 400/800/XL version.

Tested in emulator with 130XE and XL+Ultimate 1MB upgrade configs,
and base implementation on the 800XL emulator.
---
 mandel.s | 75 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 9 deletions(-)

diff --git a/mandel.s b/mandel.s
index 3ff91d1..d198989 100644
--- a/mandel.s
+++ b/mandel.s
@@ -347,14 +347,6 @@ fill_masks:
     neg 4, arg
 .endmacro
 
-; 518 - 828 cyc
-.macro imul16 dest, arg1, arg2
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
-    copy32 dest, FR2  ; 24 cyc
-.endmacro
-
 .macro shift_round_16 arg, shift
     .repeat shift
         shl32 arg
@@ -365,7 +357,7 @@ fill_masks:
 .macro imul16_round dest, arg1, arg2, shift
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; 470-780 cyc
+    jsr imul16_func   ; ? cyc
     shift_round_16 FR2, shift
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
@@ -505,6 +497,30 @@ done:
 
 .proc imul8xe_init
 
+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0
+    lda EXTENDED_RAM
+    beq init
+
+    ; no bank switching available, we just overwrite the value in base ram
+    rts
+
+init:
+
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+
+    ; create the lookup table
     ; go through the input set, in four 16KB chunks
 
     arg1 = FR1
@@ -615,6 +631,47 @@ inner_loop:
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
+    imul8 result, arg1, arg2
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg1 + 1, arg2
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1, arg2 + 1
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    imul8 inter, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, inter
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1
+arg2_pos:
+
+    rts ; 6 cyc
+.endproc
+
+.proc imul16xe_func
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2
+
+    ; h1l1 * h2l2
+    ; (h1*256 + l1) * (h2*256 + l2)
+    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
+    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
+
     imul8xe result, arg1, arg2
     lda #0
     sta result + 2

From f32cc5fa7cdd117c26b5e923a7a29bcac8079f45 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Fri, 27 Dec 2024 19:15:19 -0800
Subject: [PATCH 029/104] whoops

---
 atari-asm-xex.cfg | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 atari-asm-xex.cfg

diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
new file mode 100644
index 0000000..6e6498d
--- /dev/null
+++ b/atari-asm-xex.cfg
@@ -0,0 +1,25 @@
+FEATURES {
+    STARTADDRESS: default = $2E00;
+}
+SYMBOLS {
+    __STARTADDRESS__: type = export, value = %S;
+}
+MEMORY {
+    ZP:      file = "", define = yes, start = $0082, size = $007E;
+    MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
+}
+FILES {
+    %O: format = atari;
+}
+FORMATS {
+    atari: runad = start;
+}
+SEGMENTS {
+    ZEROPAGE: load = ZP,      type = zp,  optional = yes;
+    EXTZP:    load = ZP,      type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
+    CODE:     load = MAIN,    type = rw,                  define = yes;
+    RODATA:   load = MAIN,    type = ro   optional = yes;
+    DATA:     load = MAIN,    type = rw   optional = yes;
+    BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
+    TABLES:   load = MAIN,    type = ro,  optional = yes, align = 256;
+}

From d83b811444a1de0cf2f0dd58772d89517fe3d48c Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 28 Dec 2024 15:13:06 -0800
Subject: [PATCH 030/104] remove stray copy of the expanded-ram imul

it's not finished or working, just keep the core one :D
---
 imul8xe.s | 175 ------------------------------------------------------
 1 file changed, 175 deletions(-)
 delete mode 100644 imul8xe.s

diff --git a/imul8xe.s b/imul8xe.s
deleted file mode 100644
index 855e044..0000000
--- a/imul8xe.s
+++ /dev/null
@@ -1,175 +0,0 @@
-FR0    = $d4 ; float48
-PORTB = $d301
-
-
-EXTENDED_RAM = $4000 ; 16KiB bank on the XE
-
-; lookup table for top byte -> PORTB value for bank-switch
-.align 256
-bankswitch:
-    .repeat 256, i
-        .byte ((i & $c0) >> 5) | $c1
-    .endrepeat
-
-; 58-77 cycles
-; clobbers x, y, dest to dest + 3
-.macro imul8xe dest, arg1, arg2
-.local done
-.local output
-.local ptr
-
-    output = dest
-    ptr = dest + 2 ; scratch space assumed
-
-    ; bottom 14 bits except the LSB are the per-bank table index
-    ; add $4000 for the bank pointer
-    lda arg1     ; 3 cyc
-    and #$fe     ; 2 cyc
-    sta ptr      ; 3 cyc
-    lda arg2     ; 3 cyc
-    and #$3f     ; 2 cyc
-    clc          ; 2 cyc
-    adc #$40     ; 2 cyc
-    sta ptr + 1  ; 3 cyc
-    
-    ; top 2 bits are the table bank selector
-    ldx arg2          ; 3 cyc
-    lda bank_switch,x ; 4 cyc
-    sta PORTB         ; 4 cyc
-
-
-    ; copy the entry into output
-    ldy #0       ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output   ; 3 cyc
-    iny          ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output+1 ; 3 cyc
-
-    ; note: we are not restoring memory to save 6 cycles!
-    ; this means those 16kb have to be switched back to base RAM
-    ; if we need to use them anywhere else
-    ;;; restore memory
-    ;;lda #$81     ; 2 cyc - disabled
-    ;;sta PORTB    ; 4 cyc - disabled
-
-    ; check that 1 bit we skipped to fit into space
-    lda arg1     ; 3 cyc
-    and #1       ; 2 cyc
-    beq done     ; 2 cyc
-
-    ; add the second param one last time for the skipped bit
-    clc          ; 2 cyc
-    lda arg2     ; 3 cyc
-    adc output   ; 3 cyc
-    sta output   ; 3 cyc
-    lda #0       ; 2 cyc
-    adc output+1 ; 3 cyc
-    sta output+1 ; 3 cyc
-
-done:
-.endmacro
-
-.macro bank_switch bank
-    lda #((bank << 1) | $c1)
-    sta PORTB
-.endmacro
-
-proc imul8xe_init
-
-    ; go through the input set, in four 16KB chunks
-
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-
-    lda #$00
-    sta arg1
-    sta arg2
-
-    ; $00 * $00 -> $3f * $ff
-    bank_switch 0
-    jsr imul8xe_init_section
-
-    ; $40 * $00 -> $7f * $ff
-    bank_switch 1
-    jsr imul8xe_init_section
-
-    ; $80 * $00 -> $bf * $ff
-    bank_switch 2
-    jsr imul8xe_init_section
-
-    ; $c0 * $00 -> $ff * $ff
-    bank_switch 3
-    jsr imul8xe_init_section
-
-    rts
-endproc
-
-; Initialize a 16 KB chunk of the table
-; input: multipliers in temp
-; output: new multipliers in temp
-; clobbers: temp, temp2
-proc imul8xe_init_section
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-    ptr = temp2
-
-    lda #$00
-    sta ptr
-    lda #$40
-    sta ptr + 1
-
-    ldx #0
-    ldy #0
-
-    ; outer loop: $00 -> $3f
-outer_loop:
-
-    ; reset result to 0
-    lda #0
-    sta result
-    sta result + 1
-
-    ; inner loop: $00 -> $ff
-inner_loop:
-
-    ; copy result to data set
-    lda result
-    sta (ptr),y
-    lda result + 1
-    sta (ptr),y
-
-    ; result += 2 * arg2
-    clc
-    lda arg2
-    adc result
-    sta result
-    lda #0
-    adc result + 1
-    sta result
-    lda arg2
-    adc result
-    sta result
-    lda #0
-    adc result + 1
-    sta result
-
-    ; inner loop check
-    inc arg1
-    inc arg1
-    inc ptr
-    inc ptr
-    bne inner_loop
-
-    ; outer loop check
-    inc arg2
-    inc ptr + 1
-    lda ptr + 1
-    cmp #$40
-    bne outer_loop
-
-    rts
-
-endproc

From 0fcf4d66763a3c6f76dd99c1c5883d03238d4321 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 28 Dec 2024 17:40:21 -0800
Subject: [PATCH 031/104] comment tweak

---
 mandel.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index d198989..7c5c652 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1209,7 +1209,7 @@ zero_byte_loop:
 
     jsr imul8xe_init
 
-    ; ox = 0; oy = 0; zoom = 0
+    ; ox = 0; oy = 0
     ; count_frames = 0; count_pixels = 0
     lda #0
     sta ox

From 504457595a7d4ad1a351b70ea508031c3ad55714 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 28 Dec 2024 18:11:35 -0800
Subject: [PATCH 032/104] correct zoom border checks

---
 mandel.s | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mandel.s b/mandel.s
index 7c5c652..fe29001 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1148,11 +1148,13 @@ skip_char:
     rts
 
 plus:
+    lda zoom
     cmp #8
     bpl skip_char
     inc zoom
     jmp done
 minus:
+    lda zoom
     cmp #1
     bmi skip_char
     dec zoom

From 2b0167226e9a4bfb399bca4d856080ab28fa621c Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 28 Dec 2024 20:44:27 -0800
Subject: [PATCH 033/104] todos

---
 todo.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 todo.md

diff --git a/todo.md b/todo.md
new file mode 100644
index 0000000..ed4e628
--- /dev/null
+++ b/todo.md
@@ -0,0 +1,26 @@
+things to try:
+
+* fix the pan/zoom bug where it doesn't reset loop right :(
+
+* add some preset viewports that can be switched via number keys (1, 2, 3 etc)
+
+* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
+
+* square-root special case of multiplication for zx*zx and zy*zy
+  * the hi1*hi2 and lo1*lo2 8-bit muls can be optimized into a 512-byte lookup table
+  * jamey on mastodon tried this but had some problems. see what happens on our version!
+
+* double-check rounding behavior is correct
+
+* try 3.13 fixed point instead of 4.12 for more precision
+  * can we get away without the extra bit?
+
+* y-axis mirror optimization
+
+* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
+
+* rework the palette cycling to look more like an advancing flow
+
+* extact viewport for display & re-input via keyboard
+
+* fujinet screenshot/viewport uploader

From 0fc5ba914f7dab24046a631f889cf2c6db0a0cbe Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 12:29:36 -0800
Subject: [PATCH 034/104] fix pan/zoom bug

was missing an rts on update_palette

this happened to fall through to keycheck
which if timing was wrong would dutifully process the viewport
change and return to update_palette's caller

which in turn was -not- expecting to reset the outer loop

fixed
---
 mandel.s | 2 ++
 todo.md  | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index fe29001..d6ae4c6 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1094,6 +1094,8 @@ done:
     and #$f0
     adc palette + 3
     sta COLOR2
+
+    rts
 .endproc
 
 .proc update_speed
diff --git a/todo.md b/todo.md
index ed4e628..aebaae3 100644
--- a/todo.md
+++ b/todo.md
@@ -1,7 +1,5 @@
 things to try:
 
-* fix the pan/zoom bug where it doesn't reset loop right :(
-
 * add some preset viewports that can be switched via number keys (1, 2, 3 etc)
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D

From 2118890977591785a15aef0fcbce86414ebdd7db Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 13:10:35 -0800
Subject: [PATCH 035/104] add an alternate viewport (compile-time currently)

zoomed to max
---
 mandel.s | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/mandel.s b/mandel.s
index d6ae4c6..3e60083 100644
--- a/mandel.s
+++ b/mandel.s
@@ -246,6 +246,18 @@ fill_masks:
     .byte %00000001
     .byte %00000000
 
+viewport_zoom:
+    .byte 1
+    .byte 8
+
+viewport_ox:
+    .word $0000
+    .word $f110
+
+viewport_oy:
+    .word $0000
+    .word $fbe0
+
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
     clc ; 2 cyc
@@ -1213,13 +1225,27 @@ zero_byte_loop:
 
     jsr imul8xe_init
 
-    ; ox = 0; oy = 0
+    ; initialize viewport
+    ;ldx #0 ; overview
+    ldx #1 ; closeup
+    lda viewport_zoom,x
+    sta zoom
+
+    txa
+    asl a
+    tax
+    lda viewport_ox,x
+    sta ox
+    lda viewport_oy,x
+    sta oy
+    inx
+    lda viewport_ox,x
+    sta ox + 1
+    lda viewport_oy,x
+    sta oy + 1
+
     ; count_frames = 0; count_pixels = 0
     lda #0
-    sta ox
-    sta ox + 1
-    sta oy
-    sta oy + 1
     sta count_frames
     sta count_pixels
 
@@ -1229,10 +1255,6 @@ zero_byte_loop:
     ldx #total_pixels
     jsr ZF1
 
-    ; zoom = 2x
-    lda #1
-    sta zoom
-
     ; Disable display DMA
     lda #0
     sta DMACTL

From 15fc5367f9054de770afec40958295cbd26c7b46 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 13:18:54 -0800
Subject: [PATCH 036/104] switch to the overview as default for now

---
 mandel.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 3e60083..3579b0f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1226,8 +1226,8 @@ zero_byte_loop:
     jsr imul8xe_init
 
     ; initialize viewport
-    ;ldx #0 ; overview
-    ldx #1 ; closeup
+    ldx #0 ; overview
+    ;ldx #1 ; closeup
     lda viewport_zoom,x
     sta zoom
 

From 8ad996981abdc4a74f2e220fe2ae970c1bd90960 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 13:19:58 -0800
Subject: [PATCH 037/104] whoops

---
 mandel.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index 3579b0f..cf52fdb 100644
--- a/mandel.s
+++ b/mandel.s
@@ -107,7 +107,7 @@ KEY_RIGHT = $87
 
 .struct float48
     exponent .byte
-    mantissa .byte 6
+    mantissa .byte 5
 .endstruct
 
 .import mul_lobyte256

From f903272335b5749ef805ddbd31b52a58051e6b94 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 17:37:06 -0800
Subject: [PATCH 038/104] refactoring and start on squares

---
 mandel.s  | 284 +++++++++++++++++++++++++-----------------------------
 tables.js |  12 +++
 2 files changed, 143 insertions(+), 153 deletions(-)

diff --git a/mandel.s b/mandel.s
index cf52fdb..2e16b53 100644
--- a/mandel.s
+++ b/mandel.s
@@ -374,65 +374,13 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
-; Adapted from https://everything2.com/title/Fast+6502+multiplication
-.macro imul8 dest, arg1, arg2
-    .local under256
-    .local next
-    .local small_product
-    ; circa 92 cycles? this doesn't seem right
-    ; 81-92 cycles
-    .scope
-        mul_factor_a   = arg1
-        mul_factor_x   = arg2
-        mul_product_lo = dest
-        mul_product_hi = dest + 1
-
-        lda mul_factor_a      ; 3 cyc
-
-        ; (a + x)^2/2
-        clc                   ; 2 cyc         
-        adc mul_factor_x      ; 3 cyc
-        tax                   ; 2 cyc
-        bcc under256          ; 2 cyc
-        lda mul_hibyte512,x   ; 4 cyc
-        bcs next              ; 2 cyc
-    under256:
-        lda mul_hibyte256,x   ; 4 cyc
-        sec                   ; 2 cyc
-    next:
-        sta mul_product_hi    ; 3 cyc
-        lda mul_lobyte256,x   ; 4 cyc
-
-        ; - a^2/2
-        ldx mul_factor_a      ; 3 cyc
-        sbc mul_lobyte256,x   ; 4 cyc
-        sta mul_product_lo    ; 3 cyc
-        lda mul_product_hi    ; 3 cyc
-        sbc mul_hibyte256,x   ; 4 cyc
-        sta mul_product_hi    ; 3 cyc
-
-        ; + x & a & 1:
-        ; (this is a kludge to correct a
-        ; roundoff error that makes odd * odd too low)
-        ldx mul_factor_x      ; 3 cyc
-        txa                   ; 2 cyc
-        and mul_factor_a      ; 3 cyc
-        and #1                ; 2 cyc
-
-        clc                   ; 2 cyc
-        adc mul_product_lo    ; 3 cyc
-        bcc small_product     ; 2 cyc
-        inc mul_product_hi    ; 5 cyc
-
-        ; - x^2/2
-    small_product:
-        sec                   ; 2 cyc
-        sbc mul_lobyte256,x   ; 4 cyc
-        sta mul_product_lo    ; 3 cyc
-        lda mul_product_hi    ; 3 cyc
-        sbc mul_hibyte256,x   ; 4 cyc
-        sta mul_product_hi    ; 3 cyc
-    .endscope
+; clobbers a, x
+.macro sqr8 dest, arg
+    ldx arg
+    lda sqr_lobyte,x
+    sta dest
+    lda sqr_hibyte,x
+    sta dest + 1
 .endmacro
 
 ; lookup table for top byte -> PORTB value for bank-switch
@@ -447,64 +395,121 @@ bank_switch_table:
     sta PORTB
 .endmacro
 
+.macro imul8 dest, arg1, arg2, xe
+    .if xe
+        ; using 64KB lookup table
+        ; 58-77 cycles
+        ; clobbers x, y, dest to dest + 3
+        .scope
+            output = dest
+            ptr = dest + 2 ; scratch space assumed
 
-; 58-77 cycles
-; clobbers x, y, dest to dest + 3
-.macro imul8xe dest, arg1, arg2
-.local done
-.local output
-.local ptr
-
-    output = dest
-    ptr = dest + 2 ; scratch space assumed
-
-    ; bottom 14 bits except the LSB are the per-bank table index
-    ; add $4000 for the bank pointer
-    lda arg1     ; 3 cyc
-    and #$fe     ; 2 cyc
-    sta ptr      ; 3 cyc
-    lda arg2     ; 3 cyc
-    and #$3f     ; 2 cyc
-    clc          ; 2 cyc
-    adc #$40     ; 2 cyc
-    sta ptr + 1  ; 3 cyc
-    
-    ; top 2 bits are the table bank selector
-    ldx arg2                ; 3 cyc
-    lda bank_switch_table,x ; 4 cyc
-    sta PORTB               ; 4 cyc
+            ; bottom 14 bits except the LSB are the per-bank table index
+            ; add $4000 for the bank pointer
+            lda arg1     ; 3 cyc
+            and #$fe     ; 2 cyc
+            sta ptr      ; 3 cyc
+            lda arg2     ; 3 cyc
+            and #$3f     ; 2 cyc
+            clc          ; 2 cyc
+            adc #$40     ; 2 cyc
+            sta ptr + 1  ; 3 cyc
+            
+            ; top 2 bits are the table bank selector
+            ldx arg2                ; 3 cyc
+            lda bank_switch_table,x ; 4 cyc
+            sta PORTB               ; 4 cyc
 
 
-    ; copy the entry into output
-    ldy #0       ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output   ; 3 cyc
-    iny          ; 2 cyc
-    lda (ptr),y  ; 5 cyc
-    sta output+1 ; 3 cyc
+            ; copy the entry into output
+            ldy #0       ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output   ; 3 cyc
+            iny          ; 2 cyc
+            lda (ptr),y  ; 5 cyc
+            sta output+1 ; 3 cyc
 
-    ; note: we are not restoring memory to save 6 cycles!
-    ; this means those 16kb have to be switched back to base RAM
-    ; if we need to use them anywhere else
-    ;;; restore memory
-    ;;lda #$81     ; 2 cyc - disabled
-    ;;sta PORTB    ; 4 cyc - disabled
+            ; note: we are not restoring memory to save 6 cycles!
+            ; this means those 16kb have to be switched back to base RAM
+            ; if we need to use them anywhere else
+            ;;; restore memory
+            ;;lda #$81     ; 2 cyc - disabled
+            ;;sta PORTB    ; 4 cyc - disabled
 
-    ; check that 1 bit we skipped to fit into space
-    lda arg1     ; 3 cyc
-    and #1       ; 2 cyc
-    beq done     ; 2 cyc
+            ; check that 1 bit we skipped to fit into space
+            lda arg1     ; 3 cyc
+            and #1       ; 2 cyc
+            beq done     ; 2 cyc
 
-    ; add the second param one last time for the skipped bit
-    clc          ; 2 cyc
-    lda arg2     ; 3 cyc
-    adc output   ; 3 cyc
-    sta output   ; 3 cyc
-    lda #0       ; 2 cyc
-    adc output+1 ; 3 cyc
-    sta output+1 ; 3 cyc
+            ; add the second param one last time for the skipped bit
+            clc          ; 2 cyc
+            lda arg2     ; 3 cyc
+            adc output   ; 3 cyc
+            sta output   ; 3 cyc
+            lda #0       ; 2 cyc
+            adc output+1 ; 3 cyc
+            sta output+1 ; 3 cyc
 
-done:
+        done:
+        .endscope
+    .else
+        ; Using base 48k RAM compatibility mode
+        ; Small table of half squares
+        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
+        ; 81-92 cycles
+        .scope
+            mul_factor_a   = arg1
+            mul_factor_x   = arg2
+            mul_product_lo = dest
+            mul_product_hi = dest + 1
+
+            lda mul_factor_a      ; 3 cyc
+
+            ; (a + x)^2/2
+            clc                   ; 2 cyc         
+            adc mul_factor_x      ; 3 cyc
+            tax                   ; 2 cyc
+            bcc under256          ; 2 cyc
+            lda mul_hibyte512,x   ; 4 cyc
+            bcs next              ; 2 cyc
+        under256:
+            lda mul_hibyte256,x   ; 4 cyc
+            sec                   ; 2 cyc
+        next:
+            sta mul_product_hi    ; 3 cyc
+            lda mul_lobyte256,x   ; 4 cyc
+
+            ; - a^2/2
+            ldx mul_factor_a      ; 3 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+
+            ; + x & a & 1:
+            ; (this is a kludge to correct a
+            ; roundoff error that makes odd * odd too low)
+            ldx mul_factor_x      ; 3 cyc
+            txa                   ; 2 cyc
+            and mul_factor_a      ; 3 cyc
+            and #1                ; 2 cyc
+
+            clc                   ; 2 cyc
+            adc mul_product_lo    ; 3 cyc
+            bcc small_product     ; 2 cyc
+            inc mul_product_hi    ; 5 cyc
+
+            ; - x^2/2
+        small_product:
+            sec                   ; 2 cyc
+            sbc mul_lobyte256,x   ; 4 cyc
+            sta mul_product_lo    ; 3 cyc
+            lda mul_product_hi    ; 3 cyc
+            sbc mul_hibyte256,x   ; 4 cyc
+            sta mul_product_hi    ; 3 cyc
+        .endscope
+    .endif
 .endmacro
 
 .proc imul8xe_init
@@ -632,7 +637,13 @@ inner_loop:
 
 .endproc
 
-.proc imul16_func
+.macro imul16_impl xe
+    .local arg1
+    .local arg2
+    .local result
+    .local inter
+    .local arg1_pos
+    .local arg2_pos
     arg1 = FR0   ; 16-bit arg (clobbered)
     arg2 = FR1   ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
@@ -643,20 +654,20 @@ inner_loop:
     ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
-    imul8 result, arg1, arg2
+    imul8 result, arg1, arg2, xe
     lda #0
     sta result + 2
     sta result + 3
 
-    imul8 inter, arg1 + 1, arg2
+    imul8 inter, arg1 + 1, arg2, xe
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1, arg2 + 1
+    imul8 inter, arg1, arg2 + 1, xe
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1 + 1, arg2 + 1
+    imul8 inter, arg1 + 1, arg2 + 1, xe
     add16 result + 2, result + 2, inter
 
     ; In case of negative inputs, adjust high word
@@ -671,47 +682,14 @@ arg1_pos:
 arg2_pos:
 
     rts ; 6 cyc
+.endmacro
+
+.proc imul16_func
+    imul16_impl 0
 .endproc
 
 .proc imul16xe_func
-    arg1 = FR0   ; 16-bit arg (clobbered)
-    arg2 = FR1   ; 16-bit arg (clobbered)
-    result = FR2 ; 32-bit result
-    inter = temp2
-
-    ; h1l1 * h2l2
-    ; (h1*256 + l1) * (h2*256 + l2)
-    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
-    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
-
-    imul8xe result, arg1, arg2
-    lda #0
-    sta result + 2
-    sta result + 3
-
-    imul8xe inter, arg1 + 1, arg2
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8xe inter, arg1, arg2 + 1
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-
-    imul8xe inter, arg1 + 1, arg2 + 1
-    add16 result + 2, result + 2, inter
-
-    ; In case of negative inputs, adjust high word
-    ; https://stackoverflow.com/a/28827013
-    lda arg1 + 1
-    bpl arg1_pos
-    sub16 result + 2, result + 2, arg2
-arg1_pos:
-    lda arg2 + 1
-    bpl arg2_pos
-    sub16 result + 2, result + 2, arg1
-arg2_pos:
-
-    rts ; 6 cyc
+    imul16_impl 1
 .endproc
 
 .macro round16 arg
diff --git a/tables.js b/tables.js
index c772f81..50cbef9 100644
--- a/tables.js
+++ b/tables.js
@@ -22,7 +22,10 @@ console.log(
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
+.export sqr_lobyte
+.export sqr_hibyte
 
+; (i * i + 1) / 2 for the multiplier
 .align 256
 mul_lobyte256:
 ${db((i) => squares[i] & 0xff)}
@@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
 mul_hibyte512:
 ${db((i) => (squares[i + 256] >> 8) & 0xff)}
 
+; (i * i) for the plain squares
+.align 256
+sqr_lobyte:
+${db((i) => (i * i) & 0xff)}
+
+.align 256
+sqr_hibyte:
+${db((i) => ((i * i) >> 8) & 0xff)}
+
 `);

From 3ab5006aa3033cf595311fc6a09247c0c04f9c14 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 17:56:14 -0800
Subject: [PATCH 039/104] wip refactoring

---
 mandel.s | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 2e16b53..9eb6ce1 100644
--- a/mandel.s
+++ b/mandel.s
@@ -374,6 +374,14 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro sqr16_round dest, arg, shift
+    imul16_round dest, arg, arg, shift
+    ;copy16 FR0, arg   ; 12 cyc
+    ;jsr sqr16_func      ; ? cyc
+    ;shift_round_16 FR2, shift
+    ;copy16 dest, FR2 + 2  ; 12 cyc
+.endmacro
+
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg
@@ -537,6 +545,14 @@ init:
     lda #.hibyte(imul16xe_func)
     sta imul16_func + 2
 
+    ; ditto for sqr16_func -> sqr16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+
     ; create the lookup table
     ; go through the input set, in four 16KB chunks
 
@@ -684,6 +700,45 @@ arg2_pos:
     rts ; 6 cyc
 .endmacro
 
+.macro sqr16_impl xe
+    .local arg
+    .local result
+    .local inter
+    .local arg_pos
+    arg = FR0    ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+    inter = temp2
+
+    ; hl * hl
+    ; (h*256 + l) * (h*256 + l)
+    ; h*256*(h*256 + l) + l*(h*256 + l)
+    ; h*h*256*256 + h*l*256 + h*l*256 + l*l
+
+    sqr8 result, arg
+    lda #0
+    sta result + 2
+    sta result + 3
+
+    imul8 inter, arg + 1, arg, xe
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+    add16 result + 1, result + 1, inter
+    add_carry result + 3
+
+    sqr8 inter, arg + 1, arg + 1, xe
+    add16 result + 2, result + 2, inter
+
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg + 1
+    bpl arg_pos
+    sub16 result + 2, result + 2, arg
+    sub16 result + 2, result + 2, arg
+arg_pos:
+
+    rts ; 6 cyc
+.endmacro
+
 .proc imul16_func
     imul16_impl 0
 .endproc
@@ -692,6 +747,14 @@ arg2_pos:
     imul16_impl 1
 .endproc
 
+.proc sqr16_func
+    imul16_impl 0
+.endproc
+
+.proc sqr16xe_func
+    imul16_impl 1
+.endproc
+
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -803,10 +866,10 @@ keep_going:
     quick_exit zy, 2
 
     ; zx_2 = zx * zx
-    imul16_round zx_2, zx, zx, 4
+    sqr16_round zx_2, zx, 4
 
     ; zy_2 = zy * zy
-    imul16_round zy_2, zy, zy, 4
+    sqr16_round zy_2, zy, 4
 
     ; zx_zy = zx * zy
     imul16_round zx_zy, zx, zy, 4

From 0c63430dd95a1c3e72e0fb0c252e233ddc0c9d79 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 20:37:58 -0800
Subject: [PATCH 040/104] wip tables segment to be

---
 Makefile          |  4 ++--
 atari-asm-xex.cfg |  3 ++-
 mandel.s          | 10 +++++-----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 008bf8c..bd14c7d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@
 
 all : mandel.xex
 
-mandel.xex : mandel.o tables.o
-	ld65 -C ./atari-asm-xex.cfg -o $@ $+
+mandel.xex : mandel.o tables.o atari-asm-xex.cfg
+	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
 
 %.o : %.s
 	ca65 -o $@ $<
diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
index 6e6498d..fb43089 100644
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@@ -6,7 +6,8 @@ SYMBOLS {
 }
 MEMORY {
     ZP:      file = "", define = yes, start = $0082, size = $007E;
-    MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
+    #MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
+    MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
 }
 FILES {
     %O: format = atari;
diff --git a/mandel.s b/mandel.s
index 9eb6ce1..7bfb577 100644
--- a/mandel.s
+++ b/mandel.s
@@ -375,11 +375,11 @@ viewport_oy:
 .endmacro
 
 .macro sqr16_round dest, arg, shift
-    imul16_round dest, arg, arg, shift
-    ;copy16 FR0, arg   ; 12 cyc
-    ;jsr sqr16_func      ; ? cyc
-    ;shift_round_16 FR2, shift
-    ;copy16 dest, FR2 + 2  ; 12 cyc
+    ;imul16_round dest, arg, arg, shift
+    copy16 FR0, arg   ; 12 cyc
+    jsr sqr16_func      ; ? cyc
+    shift_round_16 FR2, shift
+    copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
 ; clobbers a, x

From 883f926e575cbc4720b25826b2252495a0621d81 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 21:06:48 -0800
Subject: [PATCH 041/104] split memory, wip

appears to work on 800 but xl/xe overlap basic lol
---
 atari-asm-xex.cfg |  3 ++-
 mandel.s          | 64 +++++++++++++++++++++++------------------------
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
index fb43089..9f871ca 100644
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@@ -8,6 +8,7 @@ MEMORY {
     ZP:      file = "", define = yes, start = $0082, size = $007E;
     #MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
     MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
+    TABLES:  file = %O, define = yes, start = $a000, size = $c000 - $a000;
 }
 FILES {
     %O: format = atari;
@@ -22,5 +23,5 @@ SEGMENTS {
     RODATA:   load = MAIN,    type = ro   optional = yes;
     DATA:     load = MAIN,    type = rw   optional = yes;
     BSS:      load = MAIN,    type = bss, optional = yes, define = yes;
-    TABLES:   load = MAIN,    type = ro,  optional = yes, align = 256;
+    TABLES:   load = TABLES,  type = ro,  optional = yes, align = 256;
 }
diff --git a/mandel.s b/mandel.s
index 7bfb577..a5bcb35 100644
--- a/mandel.s
+++ b/mandel.s
@@ -113,6 +113,8 @@ KEY_RIGHT = $87
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
+.import sqr_lobyte
+.import sqr_hibyte
 
 .data
 
@@ -701,42 +703,40 @@ arg2_pos:
 .endmacro
 
 .macro sqr16_impl xe
-    .local arg
-    .local result
-    .local inter
-    .local arg_pos
-    arg = FR0    ; 16-bit arg (clobbered)
-    result = FR2 ; 32-bit result
-    inter = temp2
+    .scope
+        arg = FR0    ; 16-bit arg (clobbered)
+        result = FR2 ; 32-bit result
+        ;inter = temp2
+        inter = FR1
 
-    ; hl * hl
-    ; (h*256 + l) * (h*256 + l)
-    ; h*256*(h*256 + l) + l*(h*256 + l)
-    ; h*h*256*256 + h*l*256 + h*l*256 + l*l
+        lda arg + 1
+        bpl arg_pos
+        neg16 arg
+    arg_pos:
 
-    sqr8 result, arg
-    lda #0
-    sta result + 2
-    sta result + 3
+        ; hl * hl
+        ; (h*256 + l) * (h*256 + l)
+        ; h*256*(h*256 + l) + l*(h*256 + l)
+        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
 
-    imul8 inter, arg + 1, arg, xe
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
-    add16 result + 1, result + 1, inter
-    add_carry result + 3
+        sqr8 result, arg
+        ;imul8 inter, arg, arg, xe
+        lda #0
+        sta result + 2
+        sta result + 3
 
-    sqr8 inter, arg + 1, arg + 1, xe
-    add16 result + 2, result + 2, inter
+        imul8 inter, arg + 1, arg, xe
+        add16 result + 1, result + 1, inter
+        add_carry result + 3
+        add16 result + 1, result + 1, inter
+        add_carry result + 3
 
-    ; In case of negative inputs, adjust high word
-    ; https://stackoverflow.com/a/28827013
-    lda arg + 1
-    bpl arg_pos
-    sub16 result + 2, result + 2, arg
-    sub16 result + 2, result + 2, arg
-arg_pos:
+        sqr8 inter, arg + 1
+        ;imul8 inter, arg + 1, arg + 1, xe
+        add16 result + 2, result + 2, inter
 
-    rts ; 6 cyc
+        rts ; 6 cyc
+    .endscope
 .endmacro
 
 .proc imul16_func
@@ -748,11 +748,11 @@ arg_pos:
 .endproc
 
 .proc sqr16_func
-    imul16_impl 0
+    sqr16_impl 0
 .endproc
 
 .proc sqr16xe_func
-    imul16_impl 1
+    sqr16_impl 1
 .endproc
 
 .macro round16 arg

From acac5a8df42f7128a785d8d6efd65b69ad2178bf Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 29 Dec 2024 21:19:55 -0800
Subject: [PATCH 042/104] moving the framebuffer into the basic space

fails on 130xe and 800xl for some reason

works on 800 as expected
---
 atari-asm-xex.cfg |  5 +++--
 mandel.s          | 10 +++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg
index 9f871ca..93b80f3 100644
--- a/atari-asm-xex.cfg
+++ b/atari-asm-xex.cfg
@@ -6,9 +6,10 @@ SYMBOLS {
 }
 MEMORY {
     ZP:      file = "", define = yes, start = $0082, size = $007E;
-    #MAIN:    file = %O, define = yes, start = %S,    size = $BC20 - %S;
     MAIN:    file = %O, define = yes, start = %S,    size = $4000 - %S;
-    TABLES:  file = %O, define = yes, start = $a000, size = $c000 - $a000;
+    # Keep $4000-7fff clear for expanded RAM access window
+    TABLES:  file = %O, define = yes, start = $8000, size = $a000 - $8000;
+    # Keep $a000-$bfff clear for BASIC cartridge
 }
 FILES {
     %O: format = atari;
diff --git a/mandel.s b/mandel.s
index a5bcb35..8517685 100644
--- a/mandel.s
+++ b/mandel.s
@@ -62,11 +62,11 @@ FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
 FMOVE  = $DDB6 ; MOVE FR0 TO FR1
 
 ; High data
-framebuffer_top    = $8000
-textbuffer         = $8f00
-framebuffer_bottom = $9000
-display_list       = $9f00
-framebuffer_end    = $a000
+framebuffer_top    = $a000
+textbuffer         = $af00
+framebuffer_bottom = $b000
+display_list       = $bf00
+framebuffer_end    = $c000
 
 height = 184
 half_height = height >> 1

From 70d2c91f03dd4e2b90dd2419e060bbb220747dd9 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 03:56:35 -0800
Subject: [PATCH 043/104] fix bank switch on xl/xe

was accidentally enabling basic rom :D

5m46s - 11.759 ms/px - 800xl
5m30s - 11.215 ms/px - 130xe
---
 mandel.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 8517685..9f594e8 100644
--- a/mandel.s
+++ b/mandel.s
@@ -397,11 +397,11 @@ viewport_oy:
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
     .repeat 256, i
-        .byte ((i & $c0) >> 4) | $e1
+        .byte ((i & $c0) >> 4) | $e3
     .endrepeat
 
 .macro bank_switch bank
-    lda #((bank << 2) | $e1)
+    lda #((bank << 2) | $e3)
     sta PORTB
 .endmacro
 

From c4b98c7be27558c662a23849126d2a802c9bf4bc Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 05:35:22 -0800
Subject: [PATCH 044/104] optimize out a temporary

down to 11.076 ms/px on xe
---
 mandel.s | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/mandel.s b/mandel.s
index 9f594e8..0239167 100644
--- a/mandel.s
+++ b/mandel.s
@@ -393,6 +393,18 @@ viewport_oy:
     sta dest + 1
 .endmacro
 
+; clobbers a, x
+.macro sqr8_add16 dest, arg
+    ldx arg
+    clc
+    lda sqr_lobyte,x
+    adc dest
+    sta dest
+    lda sqr_hibyte,x
+    adc dest + 1
+    sta dest + 1
+.endmacro
+
 ; lookup table for top byte -> PORTB value for bank-switch
 ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
 bank_switch_table:
@@ -720,7 +732,6 @@ arg2_pos:
         ; h*h*256*256 + h*l*256 + h*l*256 + l*l
 
         sqr8 result, arg
-        ;imul8 inter, arg, arg, xe
         lda #0
         sta result + 2
         sta result + 3
@@ -731,9 +742,7 @@ arg2_pos:
         add16 result + 1, result + 1, inter
         add_carry result + 3
 
-        sqr8 inter, arg + 1
-        ;imul8 inter, arg + 1, arg + 1, xe
-        add16 result + 2, result + 2, inter
+        sqr8_add16 result + 2, arg + 1
 
         rts ; 6 cyc
     .endscope

From e51aa91e4e159f4ad632759f5c5bc7c8e5e6603f Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 06:48:04 -0800
Subject: [PATCH 045/104] notes

---
 todo.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/todo.md b/todo.md
index aebaae3..2e28c8e 100644
--- a/todo.md
+++ b/todo.md
@@ -4,11 +4,7 @@ things to try:
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
-* square-root special case of multiplication for zx*zx and zy*zy
-  * the hi1*hi2 and lo1*lo2 8-bit muls can be optimized into a 512-byte lookup table
-  * jamey on mastodon tried this but had some problems. see what happens on our version!
-
-* double-check rounding behavior is correct
+* optimize out a store/load with mul8_add16 and mul8_add24
 
 * try 3.13 fixed point instead of 4.12 for more precision
   * can we get away without the extra bit?

From 100c0f33148c411a6bb066f0de32e1f79ffe2c78 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 09:16:08 -0800
Subject: [PATCH 046/104] 1/2/3 selectable viewports

---
 mandel.s | 65 ++++++++++++++++++++++++++++++++++++++++++--------------
 todo.md  |  2 --
 2 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/mandel.s b/mandel.s
index 0239167..90396d4 100644
--- a/mandel.s
+++ b/mandel.s
@@ -104,6 +104,9 @@ KEY_UP    = $8e
 KEY_DOWN  = $8f
 KEY_LEFT  = $86
 KEY_RIGHT = $87
+KEY_1     = $1f
+KEY_2     = $1e
+KEY_3     = $1a
 
 .struct float48
     exponent .byte
@@ -250,14 +253,17 @@ fill_masks:
 
 viewport_zoom:
     .byte 1
+    .byte 6
     .byte 8
 
 viewport_ox:
     .word $0000
     .word $f110
+    .word $f110
 
 viewport_oy:
     .word $0000
+    .word $fb60
     .word $fbe0
 
 ; 2 + 9 * byte cycles
@@ -1206,7 +1212,13 @@ done:
     beq left
     cpy #KEY_RIGHT
     beq right
-
+    cpy #KEY_1
+    beq one
+    cpy #KEY_2
+    beq two
+    cpy #KEY_3
+    beq three
+ 
 skip_char:
     lda #0
     rts
@@ -1234,6 +1246,19 @@ left:
     jmp done
 right:
     add16 ox, ox, temp
+    jmp done
+one:
+    ldx #0
+    jmp load_key_viewport
+two:
+    ldx #1
+    jmp load_key_viewport
+three:
+    ldx #2
+    ; fall through
+load_key_viewport:
+    jsr load_viewport
+    ; fall through
 done:
     lda #255
     rts
@@ -1271,13 +1296,10 @@ zero_byte_loop:
     rts
 .endproc
 
-.proc start
+; input: viewport selector in x
+; clobbers: a, x
+.proc load_viewport
 
-    jsr imul8xe_init
-
-    ; initialize viewport
-    ldx #0 ; overview
-    ;ldx #1 ; closeup
     lda viewport_zoom,x
     sta zoom
 
@@ -1294,16 +1316,16 @@ zero_byte_loop:
     lda viewport_oy,x
     sta oy + 1
 
-    ; count_frames = 0; count_pixels = 0
-    lda #0
-    sta count_frames
-    sta count_pixels
+    rts
+.endproc
 
-    ; total_ms = 0.0; total_pixels = 0.0
-    ldx #total_ms
-    jsr ZF1
-    ldx #total_pixels
-    jsr ZF1
+.proc start
+
+    jsr imul8xe_init
+
+    ; initialize viewport
+    ldx #0 ; overview
+    jsr load_viewport
 
     ; Disable display DMA
     lda #0
@@ -1345,6 +1367,17 @@ copy_byte_loop:
     jsr SETVBV
 
 main_loop:
+    ; count_frames = 0; count_pixels = 0
+    lda #0
+    sta count_frames
+    sta count_pixels
+
+    ; total_ms = 0.0; total_pixels = 0.0
+    ldx #total_ms
+    jsr ZF1
+    ldx #total_pixels
+    jsr ZF1
+
     jsr clear_screen
     jsr status_bar
 
diff --git a/todo.md b/todo.md
index 2e28c8e..6c6d84d 100644
--- a/todo.md
+++ b/todo.md
@@ -1,7 +1,5 @@
 things to try:
 
-* add some preset viewports that can be switched via number keys (1, 2, 3 etc)
-
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
 * optimize out a store/load with mul8_add16 and mul8_add24

From 64a6cf50f3a5d7aa46632b6ab8f83120e2c49448 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 10:21:52 -0800
Subject: [PATCH 047/104] awesome new palette cycler

---
 mandel.s | 108 ++++++++++++++++++++++++++++++++++++++++++-------------
 todo.md  |   2 --
 2 files changed, 84 insertions(+), 26 deletions(-)

diff --git a/mandel.s b/mandel.s
index 90396d4..198e40c 100644
--- a/mandel.s
+++ b/mandel.s
@@ -13,13 +13,13 @@ zy_2  = $92     ; fixed4.12: z_y^2
 zx_zy = $94     ; fixed4.12: z_x * z_y
 dist  = $96     ; fixed4.12: z_x^2 + z_y^2
 
-iter         = $a0 ; u8: iteration count
+iter          = $a0 ; u8: iteration count
 
-zoom         = $a1 ; u8: zoom shift level
-count_frames = $a2 ; u8
-count_pixels = $a3 ; u8
-total_ms     = $a4 ; float48
-total_pixels = $aa ; float48
+zoom          = $a1 ; u8: zoom shift level
+count_frames  = $a2 ; u8
+count_pixels  = $a3 ; u8
+total_ms      = $a4 ; float48
+total_pixels  = $aa ; float48
 
 z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $b1 ; u8: index into z_buffer
@@ -34,6 +34,14 @@ pixel_offset    = $bd ; u8
 fill_level      = $be ; u8
 palette_offset  = $bf ; u8
 
+palette_ticks = $c0 ; u8
+chroma_ticks  = $c1 ; u8
+chroma_offset = $c2 ; u8
+
+palette_delay = 120
+chroma_delay = 120
+
+
 ; FP registers in zero page
 FR0    = $d4 ; float48
 FRE    = $da
@@ -224,11 +232,26 @@ color_map:
         .byte 3
     .endrepeat
 
-palette:
-    .byte $00
-    .byte $46
-    .byte $78
-    .byte $b4
+
+palette_start:
+    .byte $04
+    .byte $08
+    .byte $0e
+palette_repeat:
+    .byte $03
+    .byte $09
+
+palette_entries = 3
+
+palette_chroma:
+    .repeat 15, i
+        .byte (i + 1) << 4
+    .endrepeat
+    .repeat 2, i
+        .byte (i + 1) << 4
+    .endrepeat
+palette_chroma_entries = 15
+
 .code
 
 z_buffer_len = 16
@@ -1136,31 +1159,65 @@ done:
 
 .proc vblank_handler
     inc count_frames
+
+    inc chroma_ticks
+    lda chroma_ticks
+    cmp #(chroma_delay)
+    bne skip_chroma
+
+    lda #0
+    sta chroma_ticks
+
+    inc chroma_offset
+    lda chroma_offset
+    cmp #(palette_chroma_entries)
+    bne skip_chroma
+
+    lda #0
+    sta chroma_offset
+skip_chroma:
+
+    inc palette_ticks
+    lda palette_ticks
+    cmp #(palette_delay)
+    bne skip_luma
+
+    lda #0
+    sta palette_ticks
+
     inc palette_offset
+    lda palette_offset
+    cmp #(palette_entries)
+    bne skip_luma
+
+    lda #0
+    sta palette_offset
+
+skip_luma:
     jsr update_palette
     jmp XITVBV
 .endproc
 
 .proc update_palette
-    lda palette
+    lda #0
     sta COLOR4
 
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 1
+    ldx chroma_offset
+    ldy palette_offset
+    lda palette_chroma,x
+    ora palette_start,y
     sta COLOR0
 
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 2
+    inx
+    iny
+    lda palette_chroma,x
+    ora palette_start,y
     sta COLOR1
 
-    clc
-    lda palette_offset
-    and #$f0
-    adc palette + 3
+    inx
+    iny
+    lda palette_chroma,x
+    ora palette_start,y
     sta COLOR2
 
     rts
@@ -1358,6 +1415,9 @@ copy_byte_loop:
     ; Initialize the palette
     lda #0
     sta palette_offset
+    sta palette_delay
+    sta chroma_offset
+    sta chroma_delay
     jsr update_palette
 
     ; install the vblank handler
diff --git a/todo.md b/todo.md
index 6c6d84d..a8675af 100644
--- a/todo.md
+++ b/todo.md
@@ -11,8 +11,6 @@ things to try:
 
 * 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
 
-* rework the palette cycling to look more like an advancing flow
-
 * extact viewport for display & re-input via keyboard
 
 * fujinet screenshot/viewport uploader

From 71d8d93abc60b7b2a8f9730257c13cb5751a5905 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 11:33:55 -0800
Subject: [PATCH 048/104] even better palette cycling

---
 mandel.s | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mandel.s b/mandel.s
index 198e40c..9b6b32d 100644
--- a/mandel.s
+++ b/mandel.s
@@ -38,8 +38,8 @@ palette_ticks = $c0 ; u8
 chroma_ticks  = $c1 ; u8
 chroma_offset = $c2 ; u8
 
-palette_delay = 120
-chroma_delay = 120
+palette_delay = 23
+chroma_delay = 137
 
 
 ; FP registers in zero page
@@ -1208,13 +1208,13 @@ skip_luma:
     ora palette_start,y
     sta COLOR0
 
-    inx
+    ;inx
     iny
     lda palette_chroma,x
     ora palette_start,y
     sta COLOR1
 
-    inx
+    ;inx
     iny
     lda palette_chroma,x
     ora palette_start,y

From 14125a398aa37d08e0468f2bfd56379cf884d48d Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 11:35:45 -0800
Subject: [PATCH 049/104] cycle 'in' not 'out'

---
 mandel.s | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mandel.s b/mandel.s
index 9b6b32d..985df82 100644
--- a/mandel.s
+++ b/mandel.s
@@ -234,12 +234,12 @@ color_map:
 
 
 palette_start:
-    .byte $04
-    .byte $08
     .byte $0e
+    .byte $08
+    .byte $04
 palette_repeat:
-    .byte $03
-    .byte $09
+    .byte $0e
+    .byte $08
 
 palette_entries = 3
 
@@ -1206,7 +1206,7 @@ skip_luma:
     ldy palette_offset
     lda palette_chroma,x
     ora palette_start,y
-    sta COLOR0
+    sta COLOR2
 
     ;inx
     iny
@@ -1218,7 +1218,7 @@ skip_luma:
     iny
     lda palette_chroma,x
     ora palette_start,y
-    sta COLOR2
+    sta COLOR0
 
     rts
 .endproc

From 63e74d51520f24e4e2708feaf4071ee4d8191e0f Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 13:44:31 -0800
Subject: [PATCH 050/104] tweak

---
 mandel.s | 5 ++++-
 todo.md  | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 985df82..844a286 100644
--- a/mandel.s
+++ b/mandel.s
@@ -434,13 +434,16 @@ viewport_oy:
     sta dest + 1
 .endmacro
 
+.segment "TABLES"
 ; lookup table for top byte -> PORTB value for bank-switch
-;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
+.align 256
 bank_switch_table:
     .repeat 256, i
         .byte ((i & $c0) >> 4) | $e3
     .endrepeat
 
+.code
+
 .macro bank_switch bank
     lda #((bank << 2) | $e3)
     sta PORTB
diff --git a/todo.md b/todo.md
index a8675af..4aaedc0 100644
--- a/todo.md
+++ b/todo.md
@@ -11,6 +11,6 @@ things to try:
 
 * 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
 
-* extact viewport for display & re-input via keyboard
+* extract viewport for display & re-input via keyboard
 
 * fujinet screenshot/viewport uploader

From 3bd9b1ac3164d4895b6564f48ecca76150f0384b Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 14:09:02 -0800
Subject: [PATCH 051/104] micro-optimizations in imul8xe

53-72 cycles
overview in 10.896 ms/px
---
 mandel.s | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/mandel.s b/mandel.s
index 844a286..97a8863 100644
--- a/mandel.s
+++ b/mandel.s
@@ -37,6 +37,7 @@ palette_offset  = $bf ; u8
 palette_ticks = $c0 ; u8
 chroma_ticks  = $c1 ; u8
 chroma_offset = $c2 ; u8
+ptr           = $c4 ; u16
 
 palette_delay = 23
 chroma_delay = 137
@@ -452,21 +453,19 @@ bank_switch_table:
 .macro imul8 dest, arg1, arg2, xe
     .if xe
         ; using 64KB lookup table
-        ; 58-77 cycles
-        ; clobbers x, y, dest to dest + 3
+        ; 53-72 cycles
+        ; clobbers x, y, dest, ptr
         .scope
             output = dest
-            ptr = dest + 2 ; scratch space assumed
 
             ; bottom 14 bits except the LSB are the per-bank table index
             ; add $4000 for the bank pointer
             lda arg1     ; 3 cyc
             and #$fe     ; 2 cyc
-            sta ptr      ; 3 cyc
+            tay          ; 2 cyc
             lda arg2     ; 3 cyc
             and #$3f     ; 2 cyc
-            clc          ; 2 cyc
-            adc #$40     ; 2 cyc
+            ora #$40     ; 2 cyc
             sta ptr + 1  ; 3 cyc
             
             ; top 2 bits are the table bank selector
@@ -476,7 +475,6 @@ bank_switch_table:
 
 
             ; copy the entry into output
-            ldy #0       ; 2 cyc
             lda (ptr),y  ; 5 cyc
             sta output   ; 3 cyc
             iny          ; 2 cyc
@@ -609,6 +607,9 @@ init:
     lda #$00
     sta arg1
     sta arg2
+    sta ptr
+    lda #$40
+    sta ptr + 1
 
     ; $00 * $00 -> $3f * $ff
     bank_switch 0

From 9b7f6b8937a0c7e647eec09f87c12b10de4f7ad8 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 14:22:03 -0800
Subject: [PATCH 052/104] add a viewport in the front spike

---
 mandel.s | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/mandel.s b/mandel.s
index 97a8863..9704a22 100644
--- a/mandel.s
+++ b/mandel.s
@@ -116,6 +116,13 @@ KEY_RIGHT = $87
 KEY_1     = $1f
 KEY_2     = $1e
 KEY_3     = $1a
+KEY_4     = 24
+KEY_5     = 29
+KEY_6     = 27
+KEY_7     = 51
+KEY_8     = 53
+KEY_9     = 48
+KEY_0     = 50
 
 .struct float48
     exponent .byte
@@ -279,16 +286,19 @@ viewport_zoom:
     .byte 1
     .byte 6
     .byte 8
+    .byte 6
 
 viewport_ox:
     .word $0000
     .word $f110
     .word $f110
+    .word $e400
 
 viewport_oy:
     .word $0000
     .word $fb60
     .word $fbe0
+    .word $0000
 
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@@ -1279,6 +1289,8 @@ skip_luma:
     beq two
     cpy #KEY_3
     beq three
+    cpy #KEY_4
+    beq four
  
 skip_char:
     lda #0
@@ -1316,6 +1328,9 @@ two:
     jmp load_key_viewport
 three:
     ldx #2
+    jmp load_key_viewport
+four:
+    ldx #3
     ; fall through
 load_key_viewport:
     jsr load_viewport

From 6db8cef82d4117ae2b3ede21e9ed3cf1ab720a22 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 15:17:50 -0800
Subject: [PATCH 053/104] 51-70 cycles for xe :D

---
 mandel.s | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/mandel.s b/mandel.s
index 9704a22..4ac8d4d 100644
--- a/mandel.s
+++ b/mandel.s
@@ -463,28 +463,27 @@ bank_switch_table:
 .macro imul8 dest, arg1, arg2, xe
     .if xe
         ; using 64KB lookup table
-        ; 53-72 cycles
+        ; 51-70 cycles
         ; clobbers x, y, dest, ptr
         .scope
             output = dest
 
-            ; bottom 14 bits except the LSB are the per-bank table index
-            ; add $4000 for the bank pointer
-            lda arg1     ; 3 cyc
-            and #$fe     ; 2 cyc
-            tay          ; 2 cyc
-            lda arg2     ; 3 cyc
-            and #$3f     ; 2 cyc
-            ora #$40     ; 2 cyc
-            sta ptr + 1  ; 3 cyc
-            
             ; top 2 bits are the table bank selector
             ldx arg2                ; 3 cyc
             lda bank_switch_table,x ; 4 cyc
             sta PORTB               ; 4 cyc
 
+            ; bottom 14 bits except the LSB are the per-bank table index
+            ; add $4000 for the bank pointer
+            txa          ; 2 cyc
+            and #$3f     ; 2 cyc
+            ora #$40     ; 2 cyc
+            sta ptr + 1  ; 3 cyc
 
             ; copy the entry into output
+            lda arg1     ; 3 cyc
+            and #$fe     ; 2 cyc
+            tay          ; 2 cyc
             lda (ptr),y  ; 5 cyc
             sta output   ; 3 cyc
             iny          ; 2 cyc
@@ -503,9 +502,9 @@ bank_switch_table:
             and #1       ; 2 cyc
             beq done     ; 2 cyc
 
-            ; add the second param one last time for the skipped bit
+            ; add arg2 one last time for the skipped bit
             clc          ; 2 cyc
-            lda arg2     ; 3 cyc
+            txa          ; 2 cyc
             adc output   ; 3 cyc
             sta output   ; 3 cyc
             lda #0       ; 2 cyc

From e6cbe0bc6be5e97b151d0ffd0696b290992722c9 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 16:43:18 -0800
Subject: [PATCH 054/104] notes

---
 todo.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/todo.md b/todo.md
index 4aaedc0..1281de7 100644
--- a/todo.md
+++ b/todo.md
@@ -2,14 +2,13 @@ things to try:
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
-* optimize out a store/load with mul8_add16 and mul8_add24
-
 * try 3.13 fixed point instead of 4.12 for more precision
   * can we get away without the extra bit?
 
 * y-axis mirror optimization
 
 * 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
+  * maybe redo tiering to just 4x4, 2x2, 1x1?
 
 * extract viewport for display & re-input via keyboard
 

From ed79c80b167607f0c59d7c8f33569f9bf3e981f5 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 16:50:25 -0800
Subject: [PATCH 055/104] update readme

---
 readme.md | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/readme.md b/readme.md
index 873793f..f297d60 100644
--- a/readme.md
+++ b/readme.md
@@ -14,15 +14,18 @@ Non-goals:
 
 Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
 
--- brooke, january 2023 - february 2024
+-- brooke, january 2023 - december 2024
 
 ## Current state
 
-Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
 
-The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
+The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
 
-The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.
+* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
+* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
+* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
+* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
 
 The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
 
@@ -30,17 +33,18 @@ Iterations are capped at 255.
 
 The pixels are run in a progressive layout to get the basic shape on screen faster.
 
-## Next steps
+There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
 
-Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
+There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
 
-Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
-
-I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
-(done)
+There's some cute color cycling.
 
 ## Deps and build instructions
 
 I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
 
 Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
+
+## Todo
+
+See ideas in `todo.md`.
\ No newline at end of file

From 67649d47434b8b30a9c6a3319616e6531d3ba6a5 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 19:17:02 -0800
Subject: [PATCH 056/104] annotations, tweak

---
 mandel.s | 55 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/mandel.s b/mandel.s
index 4ac8d4d..787243f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -310,18 +310,21 @@ viewport_oy:
     .endrepeat
 .endmacro
 
+; 20 cycles
 .macro add16 dest, arg1, arg2
     add 2, dest, arg1, arg2
 .endmacro
 
+; 38 cycles
 .macro add32 dest, arg1, arg2
     add 4, dest, arg2, dest
 .endmacro
 
+; 8 cycles
 .macro add_carry dest
-    lda dest
-    adc #0
-    sta dest
+    lda dest ; 3 cyc
+    adc #0   ; 2 cyc
+    sta dest ; 3 cyc
 .endmacro
 
 ; 2 + 9 * byte cycles
@@ -334,29 +337,35 @@ viewport_oy:
     .endrepeat
 .endmacro
 
+; 20 cycles
 .macro sub16 dest, arg1, arg2
     sub 2, dest, arg1, arg2
 .endmacro
 
+; 38 cycles
 .macro sub32 dest, arg1, arg2
     sub 4, dest, arg1, arg2
 .endmacro
 
+; 3 + 5 * bytes cycles
 .macro shl bytes, arg
-    asl arg
+    asl arg              ; 3 cyc
     .repeat bytes-1, i
-        rol arg + 1 + i
+        rol arg + 1 + i  ; 5 cyc
     .endrepeat
 .endmacro
 
+; 13 cycles
 .macro shl16 arg
     shl 2, arg
 .endmacro
 
+; 18 cycles
 .macro shl24 arg
     shl 3, arg
 .endmacro
 
+; 23 cycles
 .macro shl32 arg
     shl 4, arg
 .endmacro
@@ -369,14 +378,17 @@ viewport_oy:
     .endrepeat
 .endmacro
 
+; 12 cycles
 .macro copy16 dest, arg
     copy 2, dest, arg
 .endmacro
 
+; 24 cycles
 .macro copy32 dest, arg
     copy 4, dest, arg
 .endmacro
 
+; 36 cycles
 .macro copyfloat dest, arg
     copy 6, dest, arg
 .endmacro
@@ -401,9 +413,10 @@ viewport_oy:
     neg 4, arg
 .endmacro
 
+; 23 * shift
 .macro shift_round_16 arg, shift
     .repeat shift
-        shl32 arg
+        shl32 arg ; 23 cycles
     .endrepeat
     round16 arg
 .endmacro
@@ -806,6 +819,7 @@ arg2_pos:
     sqr16_impl 1
 .endproc
 
+; 11-27 cycles
 .macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local increment
@@ -818,21 +832,28 @@ arg2_pos:
     ;                   round down if negative
     ;          < $8000: round down
 
-    lda arg + 1
-    cmp #$80
-    beq high_half
-    bpl increment
-    bmi next
+    ; $8000 17
+    ; $8001 27
+    ; $8100 21
+    ; $7fff 11
+
+    lda arg + 1    ; 3 cyc
+    cmp #$80       ; 2 cyc
+    beq high_half  ; 2 cyc
+
+    bpl increment  ; 2 cyc
+
+    bmi next       ; 2 cyc
 
 high_half:
-    lda arg
-    beq check_sign
-    bpl increment
-    bmi next
+    lda arg        ; 3 cyc
+    beq check_sign ; 2 cyc
+
+    jmp increment  ; 3 cyc
 
 check_sign:
-    lda arg + 3
-    bmi next
+    lda arg + 3  ; 3 cyc
+    bmi next     ; 2 cyc
 
 increment:       ; 5-10 cyc
     inc arg + 2  ; 5 cyc

From ec42f672d43ab8aecb863791ec55b22569436524 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 19:48:28 -0800
Subject: [PATCH 057/104] use an 8-item z buffer for a slight speedup

---
 mandel.s | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index 787243f..39e71b0 100644
--- a/mandel.s
+++ b/mandel.s
@@ -262,7 +262,10 @@ palette_chroma_entries = 15
 
 .code
 
-z_buffer_len = 16
+;z_buffer_len = 16 ; 10.863 ms/px
+;z_buffer_len = 12 ; 10.619 ms/px
+z_buffer_len = 8 ; 10.612 ms/px
+;z_buffer_len = 4 ; 12.395 ms/px
 z_buffer_mask = z_buffer_len - 1
 z_buffer:
     ; the last N zx/zy values

From 0a7293d8bca6cc56182c356c993002ae1482f017 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 19:52:35 -0800
Subject: [PATCH 058/104] do 4x4 2x2 1x1 only

in prep for bigger pixels
---
 mandel.s | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mandel.s b/mandel.s
index 39e71b0..b88105b 100644
--- a/mandel.s
+++ b/mandel.s
@@ -276,11 +276,12 @@ z_buffer:
 
 .export start
 
-max_fill_level = 6
+;max_fill_level = 6
+max_fill_level = 3
 fill_masks:
-    .byte %00011111
-    .byte %00001111
-    .byte %00000111
+;    .byte %00011111
+;    .byte %00001111
+;    .byte %00000111
     .byte %00000011
     .byte %00000001
     .byte %00000000

From b56dc1e98bfeb3c18c4f90df0e0d19fbe5362cde Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 20:38:33 -0800
Subject: [PATCH 059/104] notes

---
 mandel.s | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mandel.s b/mandel.s
index b88105b..fc30532 100644
--- a/mandel.s
+++ b/mandel.s
@@ -417,19 +417,20 @@ viewport_oy:
     neg 4, arg
 .endmacro
 
-; 23 * shift
+; 11-27 + 23 * shift cycles
+; 103-119 cycles for shift=4
 .macro shift_round_16 arg, shift
     .repeat shift
         shl32 arg ; 23 cycles
     .endrepeat
-    round16 arg
+    round16 arg ; 11-27 cycles
 .endmacro
 
 .macro imul16_round dest, arg1, arg2, shift
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
     jsr imul16_func   ; ? cyc
-    shift_round_16 FR2, shift
+    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
@@ -437,7 +438,7 @@ viewport_oy:
     ;imul16_round dest, arg, arg, shift
     copy16 FR0, arg   ; 12 cyc
     jsr sqr16_func      ; ? cyc
-    shift_round_16 FR2, shift
+    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 

From 61eb1aaf21fdac377e6f04db117aa855ad73b940 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 05:11:26 -0800
Subject: [PATCH 060/104] notes

---
 todo.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/todo.md b/todo.md
index 1281de7..6fb0282 100644
--- a/todo.md
+++ b/todo.md
@@ -1,5 +1,11 @@
 things to try:
 
+* skip add on the top-byte multiply in sqr8/mul8
+  * should save a few cycles, suggestion by jamey
+
+* perform the zx += zx^s + cx in 32-bit space, before rounding
+  * should improve precision on max zoom, might cost a few cycles
+
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
 * try 3.13 fixed point instead of 4.12 for more precision

From 0d086a179cf8e91b839f306bb597ef9e6125f6b2 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 07:20:53 -0800
Subject: [PATCH 061/104] wip

---
 mandel.s | 108 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 68 insertions(+), 40 deletions(-)

diff --git a/mandel.s b/mandel.s
index fc30532..50213ad 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,43 +1,42 @@
 ; Our zero-page vars
-sx    = $80     ; i16: screen pixel x
-sy    = $82     ; i16: screen pixel y
-ox    = $84     ; fixed4.12: center point x
-oy    = $86     ; fixed4.12: center point y
-cx    = $88     ; fixed4.12: c_x
-cy    = $8a     ; fixed4.12: c_y
-zx    = $8c     ; fixed4.12: z_x
-zy    = $8e     ; fixed4.12: z_y
+ox              = $80 ; fixed8.24: center point x
+oy              = $84 ; fixed8.24: center point y
+cx              = $88 ; fixed8.24: c_x
+cy              = $8c ; fixed8.24: c_y
 
-zx_2  = $90     ; fixed4.12: z_x^2
-zy_2  = $92     ; fixed4.12: z_y^2
-zx_zy = $94     ; fixed4.12: z_x * z_y
-dist  = $96     ; fixed4.12: z_x^2 + z_y^2
+zx              = $90 ; fixed8.24: z_x
+zy              = $94 ; fixed8.24: z_y
+zx_2            = $98 ; fixed8.24: z_x^2
+zy_2            = $9c ; fixed8.24: z_y^2
 
-iter          = $a0 ; u8: iteration count
+zx_zy           = $a0 ; fixed8.24: z_x * z_y
+dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
+sx              = $a8 ; i16: screen pixel x
+sy              = $aa ; i16: screen pixel y
+z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
+z_buffer_start  = $ad ; u8: index into z_buffer
+z_buffer_end    = $ae ; u8: index into z_buffer
+iter            = $af ; u8: iteration count
 
-zoom          = $a1 ; u8: zoom shift level
-count_frames  = $a2 ; u8
-count_pixels  = $a3 ; u8
-total_ms      = $a4 ; float48
-total_pixels  = $aa ; float48
+ptr             = $b0 ; u16
+pixel_ptr       = $b2 ; u16
+zoom            = $b4 ; u8: zoom shift level
+fill_level      = $b5 ; u8
+pixel_color     = $b6 ; u8
+pixel_mask      = $b7 ; u8
+pixel_shift     = $b8 ; u8
+pixel_offset    = $b9 ; u8
+palette_offset  = $ba ; u8
+chroma_offset   = $bb ; u8
+palette_ticks   = $bc ; u8
+chroma_ticks    = $bd ; u8
+count_frames    = $be ; u8
+count_pixels    = $bf ; u8
 
-z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
-z_buffer_start  = $b1 ; u8: index into z_buffer
-z_buffer_end    = $b2 ; u8: index into z_buffer
-temp            = $b4 ; u16
-temp2           = $b6 ; u16
-pixel_ptr       = $b8 ; u16
-pixel_color     = $ba ; u8
-pixel_mask      = $bb ; u8
-pixel_shift     = $bc ; u8
-pixel_offset    = $bd ; u8
-fill_level      = $be ; u8
-palette_offset  = $bf ; u8
-
-palette_ticks = $c0 ; u8
-chroma_ticks  = $c1 ; u8
-chroma_offset = $c2 ; u8
-ptr           = $c4 ; u16
+total_pixels    = $c0 ; float48
+total_ms        = $c6 ; float48
+temp            = $cc ; u16
+temp2           = $ce ; u16
 
 palette_delay = 23
 chroma_delay = 137
@@ -884,12 +883,41 @@ next:
     ; zx_zy = 0
     ; dist = 0
     ; iter = 0
+;    lda #00
+;    ldx #(iter - zx + 1)
+;initloop:
+;    sta zx - 1,x
+;    dex
+;    bne initloop
+;    sta z_buffer_start
+;    sta z_buffer_end
+
     lda #00
-    ldx #(iter - zx + 1)
-initloop:
-    sta zx - 1,x
-    dex
-    bne initloop
+    sta zx
+    sta zx + 1
+    sta zx + 2
+    sta zx + 3
+    sta zy
+    sta zy + 1
+    sta zy + 2
+    sta zy + 3
+    sta zx_2
+    sta zx_2 + 1
+    sta zx_2 + 2
+    sta zx_2 + 3
+    sta zy_2
+    sta zy_2 + 1
+    sta zy_2 + 2
+    sta zy_2 + 3
+    sta zx_zy
+    sta zx_zy + 1
+    sta zx_zy + 2
+    sta zx_zy + 3
+    sta dist
+    sta dist + 1
+    sta dist + 2
+    sta dist + 3
+    sta iter
     sta z_buffer_start
     sta z_buffer_end
 

From 4a1e35699adcce1af0f60ea51573e8a215975c66 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 07:19:45 -0800
Subject: [PATCH 062/104] wip

---
 mandel.s | 71 ++++++++++++++++++++++++++++++++++++++------------------
 todo.md  |  2 +-
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/mandel.s b/mandel.s
index 50213ad..622ff62 100644
--- a/mandel.s
+++ b/mandel.s
@@ -433,6 +433,13 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro imul16 dest, arg1, arg2
+    copy16 FR0, arg1  ; 12 cyc
+    copy16 FR1, arg2  ; 12 cyc
+    jsr imul16_func   ; ? cyc
+    copy32 dest, FR2  ; 24 cyc
+.endmacro
+
 .macro sqr16_round dest, arg, shift
     ;imul16_round dest, arg, arg, shift
     copy16 FR0, arg   ; 12 cyc
@@ -441,6 +448,12 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro sqr16 dest, arg
+    copy16 FR0, arg   ; 12 cyc
+    jsr sqr16_func    ; ? cyc
+    copy32 dest, FR2  ; 24 cyc
+.endmacro
+
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg
@@ -870,8 +883,8 @@ next:
 
 .proc mandelbrot
     ; input:
-    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; cy: position scaled to 4.12
+    ; cx: position scaled to 8.24 fixed point - -128..+127.9
+    ; cy: position scaled to 8.24
     ;
     ; output:
     ; iter: iteration count at escape or 0
@@ -909,10 +922,6 @@ next:
     sta zy_2 + 1
     sta zy_2 + 2
     sta zy_2 + 3
-    sta zx_zy
-    sta zx_zy + 1
-    sta zx_zy + 2
-    sta zx_zy + 3
     sta dist
     sta dist + 1
     sta dist + 2
@@ -929,6 +938,8 @@ loop:
 keep_going:
 
     .macro quick_exit arg, max
+        ; arg: fixed8.24
+        ; max: integer
         .local positive
         .local negative
         .local nope_out
@@ -936,51 +947,61 @@ keep_going:
         .local all_done
 
         ; check sign bit
-        lda arg + 1
+        lda arg + 3
         bmi negative
 
     positive:
-        cmp #((max) << 4)
+        cmp #max
         bmi all_done ; 'less than'
         jmp exit_path
 
     negative:
-        cmp #(256 - ((max) << 4))
+        cmp #(256 - max)
         beq first_equal ; 'equal' on first byte
         bpl all_done    ; 'greater than'
 
     nope_out:
         jmp exit_path
-    
+
     first_equal:
+        ; following bytes all 0 shows it's really 'equal'
+        lda arg + 2
+        bne all_done
+        lda arg + 1
+        bne all_done
         lda arg
-        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
+        bne all_done
+        jmp exit_path
 
     all_done:
     .endmacro
 
-    ; 4.12: (-8 .. +7.9)
+    ; 8.24: (-128 .. 127.9) / (-8 .. +7.9)
     ; zx = zx_2  - zy_2  + cx
-    sub16 zx, zx_2, zy_2
-    add16 zx, zx, cx
+    sub32 zx, zx_2, zy_2
+    add32 zx, zx, cx
     quick_exit zx, 2
 
     ; zy = zx_zy + zx_zy + cy
-    add16 zy, zx_zy, zx_zy
-    add16 zy, zy, cy
+    add32 zy, zx_zy, zx_zy
+    add32 zy, zy, cy
     quick_exit zy, 2
 
+    ; convert 8.24 -> 4.12
+    shift_round_16 zx, 4
+    shift_round_16 zy, 4
+
     ; zx_2 = zx * zx
-    sqr16_round zx_2, zx, 4
+    sqr16 zx_2, zx + 2
 
     ; zy_2 = zy * zy
-    sqr16_round zy_2, zy, 4
+    sqr16 zy_2, zy + 2
 
     ; zx_zy = zx * zy
-    imul16_round zx_zy, zx, zy, 4
+    imul16 zx_zy, zx + 2, zy + 2
 
     ; dist = zx_2 + zy_2
-    add16 dist, zx_2, zy_2
+    add32 dist, zx_2, zy_2
     quick_exit dist, 4
 
     ; if may be in the lake, look for looping output with a small buffer
@@ -1090,13 +1111,17 @@ enough:
 .endmacro
 
 .macro zoom_factor dest, src, zoom, aspect
+    ; output: dest: fixed8.24
+    ; input: src: fixed4.12
+    ; input: zoom: u8 ???
+    ; aspect: fixed4.12
     ; clobbers A, X, flags, etc
     copy16 dest, src
     scale_zoom dest
 
     ; cy = cy * (3 / 4)
     ; cx = cx * (5 / 4)
-    imul16_round dest, dest, aspect, 4
+    imul16 dest, dest, aspect
 .endmacro
 
 .proc pset
@@ -1567,9 +1592,9 @@ not_skipped_mask:
 
     ; run the fractal!
     zoom_factor cx, sx, zoom, aspect_x
-    add16 cx, cx, ox
+    add32 cx, cx, ox
     zoom_factor cy, sy, zoom, aspect_y
-    add16 cy, cy, oy
+    add32 cy, cy, oy
     jsr mandelbrot
     jsr pset
 
diff --git a/todo.md b/todo.md
index 6fb0282..29217cd 100644
--- a/todo.md
+++ b/todo.md
@@ -3,7 +3,7 @@ things to try:
 * skip add on the top-byte multiply in sqr8/mul8
   * should save a few cycles, suggestion by jamey
 
-* perform the zx += zx^s + cx in 32-bit space, before rounding
+* perform the zx_next = zx^s + cx in 32-bit space, before rounding
   * should improve precision on max zoom, might cost a few cycles
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D

From 7184b8e03f2748efd532277995afe5fa7d4a3cf6 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 07:33:20 -0800
Subject: [PATCH 063/104] wip

---
 mandel.s | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index 622ff62..86a6b48 100644
--- a/mandel.s
+++ b/mandel.s
@@ -292,16 +292,16 @@ viewport_zoom:
     .byte 6
 
 viewport_ox:
-    .word $0000
-    .word $f110
-    .word $f110
-    .word $e400
+    .dword $00000000
+    .dword $ff110000
+    .dword $ff110000
+    .dword $fe400000
 
 viewport_oy:
-    .word $0000
-    .word $fb60
-    .word $fbe0
-    .word $0000
+    .dword $00000000
+    .dword $ffb60000
+    .dword $ffbe0000
+    .dword $00000000
 
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@@ -1459,17 +1459,32 @@ zero_byte_loop:
 
     txa
     asl a
+    asl a
+
     tax
     lda viewport_ox,x
     sta ox
     lda viewport_oy,x
     sta oy
+
     inx
     lda viewport_ox,x
     sta ox + 1
     lda viewport_oy,x
     sta oy + 1
 
+    inx
+    lda viewport_ox,x
+    sta ox + 2
+    lda viewport_oy,x
+    sta oy + 2
+
+    inx
+    lda viewport_ox,x
+    sta ox + 3
+    lda viewport_oy,x
+    sta oy + 3
+
     rts
 .endproc
 

From 13257309dc3a6493e05575404f5deddd09e9192d Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 08:34:02 -0800
Subject: [PATCH 064/104] init fix

---
 mandel.s | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mandel.s b/mandel.s
index 86a6b48..76816c2 100644
--- a/mandel.s
+++ b/mandel.s
@@ -922,6 +922,10 @@ next:
     sta zy_2 + 1
     sta zy_2 + 2
     sta zy_2 + 3
+    sta zx_zy
+    sta zx_zy + 1
+    sta zx_zy + 2
+    sta zx_zy + 3
     sta dist
     sta dist + 1
     sta dist + 2

From 2fcb30b76a66819ab96ec3353b8ce4978f723675 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 08:56:59 -0800
Subject: [PATCH 065/104] wip

---
 mandel.s | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mandel.s b/mandel.s
index 76816c2..0400003 100644
--- a/mandel.s
+++ b/mandel.s
@@ -980,7 +980,7 @@ keep_going:
     all_done:
     .endmacro
 
-    ; 8.24: (-128 .. 127.9) / (-8 .. +7.9)
+    ; 8.24: (-128 .. 127.9)
     ; zx = zx_2  - zy_2  + cx
     sub32 zx, zx_2, zy_2
     add32 zx, zx, cx
@@ -991,7 +991,7 @@ keep_going:
     add32 zy, zy, cy
     quick_exit zy, 2
 
-    ; convert 8.24 -> 4.12
+    ; convert 8.24 -> 4.12: (-8 .. +7.9)
     shift_round_16 zx, 4
     shift_round_16 zy, 4
 
@@ -1042,10 +1042,10 @@ z_buffer_loop:
 
     ; Compare the previously stored z values
     ldy #0
-    z_compare zx
-    z_compare zx + 1
-    z_compare zy
-    z_compare zy + 1
+    z_compare zx + 2
+    z_compare zx + 3
+    z_compare zy + 2
+    z_compare zy + 3
 
     cpy #4
     bne z_no_matches
@@ -1060,10 +1060,10 @@ z_no_matches:
 z_nothing_to_read:
 
     ; Store and expand
-    z_store zx
-    z_store zx + 1
-    z_store zy
-    z_store zy + 1
+    z_store zx + 2
+    z_store zx + 3
+    z_store zy + 2
+    z_store zy + 3
     z_advance
     stx z_buffer_end
 

From d2f41f964435b3803ce694a70bf38687fd467caa Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 09:02:42 -0800
Subject: [PATCH 066/104] wip

---
 mandel.s | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/mandel.s b/mandel.s
index 0400003..8b63941 100644
--- a/mandel.s
+++ b/mandel.s
@@ -425,14 +425,8 @@ viewport_oy:
     round16 arg ; 11-27 cycles
 .endmacro
 
-.macro imul16_round dest, arg1, arg2, shift
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
-    copy16 dest, FR2 + 2  ; 12 cyc
-.endmacro
-
+; input: arg1, arg2 as fixed4.12
+; output: dest as fixed8.24
 .macro imul16 dest, arg1, arg2
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
@@ -440,20 +434,16 @@ viewport_oy:
     copy32 dest, FR2  ; 24 cyc
 .endmacro
 
-.macro sqr16_round dest, arg, shift
-    ;imul16_round dest, arg, arg, shift
-    copy16 FR0, arg   ; 12 cyc
-    jsr sqr16_func      ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
-    copy16 dest, FR2 + 2  ; 12 cyc
-.endmacro
-
+; input: arg as fixed4.12
+; output: dest as fixed8.24
 .macro sqr16 dest, arg
     copy16 FR0, arg   ; 12 cyc
     jsr sqr16_func    ; ? cyc
     copy32 dest, FR2  ; 24 cyc
 .endmacro
 
+; input: arg as u8
+; output: dest as u16
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg

From 1e0f577e099b3d7787d6e6d4fce1813ccd6b489c Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 09:09:11 -0800
Subject: [PATCH 067/104] wip

---
 mandel.s | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mandel.s b/mandel.s
index 8b63941..6977582 100644
--- a/mandel.s
+++ b/mandel.s
@@ -453,6 +453,8 @@ viewport_oy:
     sta dest + 1
 .endmacro
 
+; input: arg as u8
+; input/output: dest as u16
 ; clobbers a, x
 .macro sqr8_add16 dest, arg
     ldx arg

From 81bf7f3c434646f0374c35f20131050bd314d1b2 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 09:53:22 -0800
Subject: [PATCH 068/104] tweak

---
 mandel.s | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mandel.s b/mandel.s
index 6977582..4ab6c19 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1106,10 +1106,9 @@ cont:
 enough:
 .endmacro
 
-.macro zoom_factor dest, src, zoom, aspect
+.macro zoom_factor dest, src, aspect
     ; output: dest: fixed8.24
     ; input: src: fixed4.12
-    ; input: zoom: u8 ???
     ; aspect: fixed4.12
     ; clobbers A, X, flags, etc
     copy16 dest, src
@@ -1602,9 +1601,9 @@ skipped_mask:
 not_skipped_mask:
 
     ; run the fractal!
-    zoom_factor cx, sx, zoom, aspect_x
+    zoom_factor cx, sx, aspect_x
     add32 cx, cx, ox
-    zoom_factor cy, sy, zoom, aspect_y
+    zoom_factor cy, sy, aspect_y
     add32 cy, cy, oy
     jsr mandelbrot
     jsr pset

From 2e8893fd7892429bc07bd1d653ef1319be7d2d7b Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 13:54:53 -0800
Subject: [PATCH 069/104] haha fuck me

---
 mandel.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index 4ab6c19..04edec5 100644
--- a/mandel.s
+++ b/mandel.s
@@ -320,7 +320,7 @@ viewport_oy:
 
 ; 38 cycles
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg2, dest
+    add 4, dest, arg1, arg2
 .endmacro
 
 ; 8 cycles

From cc83c76706519cce3fff61ce46df9589d31025d6 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 14:16:43 -0800
Subject: [PATCH 070/104] update docs for 32-bit intermediates

---
 readme.md | 4 ++--
 todo.md   | 4 +---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/readme.md b/readme.md
index f297d60..d60644c 100644
--- a/readme.md
+++ b/readme.md
@@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
 
-The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
 
 Iterations are capped at 255.
 
@@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e
 
 ## Todo
 
-See ideas in `todo.md`.
\ No newline at end of file
+See ideas in `todo.md`.
diff --git a/todo.md b/todo.md
index 29217cd..284d653 100644
--- a/todo.md
+++ b/todo.md
@@ -3,13 +3,11 @@ things to try:
 * skip add on the top-byte multiply in sqr8/mul8
   * should save a few cycles, suggestion by jamey
 
-* perform the zx_next = zx^s + cx in 32-bit space, before rounding
-  * should improve precision on max zoom, might cost a few cycles
-
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
 * try 3.13 fixed point instead of 4.12 for more precision
   * can we get away without the extra bit?
+  * since exit compare space would be 6.26 i think so
 
 * y-axis mirror optimization
 

From 7985ea9a399554340a76f8cfc340bb566d86a952 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 14:45:38 -0800
Subject: [PATCH 071/104] fix panning for 32-bit

---
 mandel.s | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/mandel.s b/mandel.s
index 04edec5..fe86366 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1341,12 +1341,15 @@ skip_luma:
     cpy #KEY_MINUS
     beq minus
 
-    ; temp = $0010 << (8 - zoom)
-    lda #$10
-    sta temp
+    ; temp+temp2 = $00010000 << (8 - zoom)
     lda #$00
+    sta temp
     sta temp + 1
-    scale_zoom temp
+    lda #$01
+    sta temp + 2
+    lda #$00
+    sta temp + 3
+    scale_zoom temp + 2
 
     cpy #KEY_UP
     beq up
@@ -1356,14 +1359,7 @@ skip_luma:
     beq left
     cpy #KEY_RIGHT
     beq right
-    cpy #KEY_1
-    beq one
-    cpy #KEY_2
-    beq two
-    cpy #KEY_3
-    beq three
-    cpy #KEY_4
-    beq four
+    jmp number_keys
  
 skip_char:
     lda #0
@@ -1382,17 +1378,28 @@ minus:
     dec zoom
     jmp done
 up:
-    sub16 oy, oy, temp 
+    sub32 oy, oy, temp
     jmp done
 down:
-    add16 oy, oy, temp
+    add32 oy, oy, temp
     jmp done
 left:
-    sub16 ox, ox, temp
+    sub32 ox, ox, temp
     jmp done
 right:
-    add16 ox, ox, temp
+    add32 ox, ox, temp
     jmp done
+
+number_keys:
+    cpy #KEY_1
+    beq one
+    cpy #KEY_2
+    beq two
+    cpy #KEY_3
+    beq three
+    cpy #KEY_4
+    beq four
+
 one:
     ldx #0
     jmp load_key_viewport

From d8601bb856ac0858ea7a06f4c60f162f1664c52a Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 15:03:43 -0800
Subject: [PATCH 072/104] fix fix

---
 mandel.s | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mandel.s b/mandel.s
index fe86366..b8985b3 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1399,6 +1399,7 @@ number_keys:
     beq three
     cpy #KEY_4
     beq four
+    jmp skip_char
 
 one:
     ldx #0

From 87caa52543f3aec6ff3c87dc79dd734182d6be87 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 15:45:03 -0800
Subject: [PATCH 073/104] add viewport number 5 full zoom

---
 mandel.s | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mandel.s b/mandel.s
index b8985b3..9546b68 100644
--- a/mandel.s
+++ b/mandel.s
@@ -290,18 +290,21 @@ viewport_zoom:
     .byte 6
     .byte 8
     .byte 6
+    .byte 8
 
 viewport_ox:
     .dword $00000000
     .dword $ff110000
     .dword $ff110000
     .dword $fe400000
+    .dword $fe3b0000
 
 viewport_oy:
     .dword $00000000
     .dword $ffb60000
     .dword $ffbe0000
     .dword $00000000
+    .dword $fffe0000
 
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@@ -1399,6 +1402,8 @@ number_keys:
     beq three
     cpy #KEY_4
     beq four
+    cpy #KEY_5
+    beq five
     jmp skip_char
 
 one:
@@ -1412,6 +1417,9 @@ three:
     jmp load_key_viewport
 four:
     ldx #3
+    jmp load_key_viewport
+five:
+    ldx #4
     ; fall through
 load_key_viewport:
     jsr load_viewport

From f1ebb21bcbf9861d19c1dcb9e38f37503b1d22ee Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 17:49:13 -0800
Subject: [PATCH 074/104] wip not working wide pixels

---
 mandel.s | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/mandel.s b/mandel.s
index 9546b68..f777cbc 100644
--- a/mandel.s
+++ b/mandel.s
@@ -234,9 +234,9 @@ display_list_len = display_list_end - display_list_start
 color_map:
     .byte 0
     .repeat 85
-        .byte 1
-        .byte 2
-        .byte 3
+        .byte %01010101
+        .byte %10101010
+        .byte %11111111
     .endrepeat
 
 
@@ -285,6 +285,11 @@ fill_masks:
     .byte %00000001
     .byte %00000000
 
+pixel_masks:
+    .byte $ff
+    .byte $0f
+    .byte $03
+
 viewport_zoom:
     .byte 1
     .byte 6
@@ -1130,8 +1135,11 @@ enough:
     ; iter -> color
     ldx iter
     lda color_map,x
+    ldx fill_level
+    and pixel_masks,x
     sta pixel_color
-    lda #(255 - 3)
+    lda pixel_masks,x
+    eor #$ff
     sta pixel_mask
 
     ; sy -> line base address in temp

From 49fe3155294c0e392904a85154bf7dbe9d2e7808 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 20:13:11 -0800
Subject: [PATCH 075/104] 'wide pixels'

should get better color on the composite video because the
scanlines will be fuller of data
---
 mandel.s | 17 +++++++----------
 todo.md  |  3 +--
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/mandel.s b/mandel.s
index f777cbc..472f613 100644
--- a/mandel.s
+++ b/mandel.s
@@ -286,9 +286,9 @@ fill_masks:
     .byte %00000000
 
 pixel_masks:
-    .byte $ff
-    .byte $0f
-    .byte $03
+    .byte %11111111
+    .byte %11110000
+    .byte %11000000
 
 viewport_zoom:
     .byte 1
@@ -1188,18 +1188,15 @@ point:
     ; pixel_mask <<= pixel_shift (shifting in ones)
     and #3
     sta pixel_shift
-    lda #3
-    sec
-    sbc pixel_shift
     tax
 shift_loop:
     beq shift_done
-    asl pixel_color
-    asl pixel_color
+    lsr pixel_color
+    lsr pixel_color
     sec
-    rol pixel_mask
+    ror pixel_mask
     sec
-    rol pixel_mask
+    ror pixel_mask
     dex
     jmp shift_loop
 shift_done:
diff --git a/todo.md b/todo.md
index 284d653..e8cffe3 100644
--- a/todo.md
+++ b/todo.md
@@ -11,8 +11,7 @@ things to try:
 
 * y-axis mirror optimization
 
-* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
-  * maybe redo tiering to just 4x4, 2x2, 1x1?
+* try filling in the extra scanlines on 4x4 and 2x2 tiering
 
 * extract viewport for display & re-input via keyboard
 

From c424f1b8bc784c1b3bdbed15bb841a068b637039 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 22:10:27 -0800
Subject: [PATCH 076/104] fill in scanlines during tiering

---
 mandel.s | 17 +++++++++++++++++
 todo.md  |  2 --
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 472f613..1f5a06f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1201,6 +1201,10 @@ shift_loop:
     jmp shift_loop
 shift_done:
 
+    ldy fill_level
+    ldx fill_masks,y
+    inx
+
     ; pixel_offset = temp >> 2
     lda temp
     lsr a
@@ -1208,12 +1212,25 @@ shift_done:
     sta pixel_offset
     tay
 
+draw_pixel:
     ; read, mask, or, write
     lda (pixel_ptr),y
     and pixel_mask
     ora pixel_color
     sta (pixel_ptr),y
 
+    dex
+    beq done
+    clc
+    lda #40
+    adc pixel_ptr
+    sta pixel_ptr
+    lda #0
+    adc pixel_ptr + 1
+    sta pixel_ptr + 1
+    jmp draw_pixel
+
+done:
     rts
 .endproc
 
diff --git a/todo.md b/todo.md
index e8cffe3..7ab092b 100644
--- a/todo.md
+++ b/todo.md
@@ -11,8 +11,6 @@ things to try:
 
 * y-axis mirror optimization
 
-* try filling in the extra scanlines on 4x4 and 2x2 tiering
-
 * extract viewport for display & re-input via keyboard
 
 * fujinet screenshot/viewport uploader

From 65fcb44934d1eedd4ec149082674ac491eef76f8 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Wed, 1 Jan 2025 15:37:12 -0800
Subject: [PATCH 077/104] 3.13 / 6.26 gives nicer results!

---
 mandel.s  | 86 +++++++++++++++++++++++++++++--------------------------
 readme.md |  4 +--
 todo.md   |  4 ---
 3 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/mandel.s b/mandel.s
index 1f5a06f..9996c53 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,16 +1,16 @@
 ; Our zero-page vars
-ox              = $80 ; fixed8.24: center point x
-oy              = $84 ; fixed8.24: center point y
-cx              = $88 ; fixed8.24: c_x
-cy              = $8c ; fixed8.24: c_y
+ox              = $80 ; fixed6.26: center point x
+oy              = $84 ; fixed6.26: center point y
+cx              = $88 ; fixed6.26: c_x
+cy              = $8c ; fixed6.26: c_y
 
-zx              = $90 ; fixed8.24: z_x
-zy              = $94 ; fixed8.24: z_y
-zx_2            = $98 ; fixed8.24: z_x^2
-zy_2            = $9c ; fixed8.24: z_y^2
+zx              = $90 ; fixed6.26: z_x
+zy              = $94 ; fixed6.26: z_y
+zx_2            = $98 ; fixed6.26: z_x^2
+zy_2            = $9c ; fixed6.26: z_y^2
 
-zx_zy           = $a0 ; fixed8.24: z_x * z_y
-dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
+zx_zy           = $a0 ; fixed6.26: z_x * z_y
+dist            = $a4 ; fixed6.26: z_x^2 + z_y^2
 sx              = $a8 ; i16: screen pixel x
 sy              = $aa ; i16: screen pixel y
 z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
@@ -189,11 +189,11 @@ aspect:
     ;
     ; 184h is the equiv of 220.8h at square pixels
     ; 320 / 220.8 = 1.45 display aspect ratio
-aspect_x: ; fixed4.16 5/4
-    .word 5 << (12 - 2)
+aspect_x: ; fixed3.13 5/4
+    .word 5 << (13 - 2)
 
-aspect_y: ; fixed4.16 3/4
-    .word 3 << (12 - 2)
+aspect_y: ; fixed3.13 3/4
+    .word 3 << (13 - 2)
 
 ms_per_frame: ; float48 16.66666667
     .byte 64  ; exponent/sign
@@ -291,25 +291,26 @@ pixel_masks:
     .byte %11000000
 
 viewport_zoom:
-    .byte 1
-    .byte 6
-    .byte 8
-    .byte 6
+    .byte 0
+    .byte 5
+    .byte 7
+    .byte 5
+    .byte 7
     .byte 8
 
 viewport_ox:
-    .dword $00000000
-    .dword $ff110000
-    .dword $ff110000
-    .dword $fe400000
-    .dword $fe3b0000
+    .dword ($00000000 & $3fffffff) << 2
+    .dword ($ff110000 & $3fffffff) << 2
+    .dword ($ff110000 & $3fffffff) << 2
+    .dword ($fe400000 & $3fffffff) << 2
+    .dword ($fe3b0000 & $3fffffff) << 2
 
 viewport_oy:
-    .dword $00000000
-    .dword $ffb60000
-    .dword $ffbe0000
-    .dword $00000000
-    .dword $fffe0000
+    .dword ($00000000 & $3fffffff) << 2
+    .dword ($ffb60000 & $3fffffff) << 2
+    .dword ($ffbe0000 & $3fffffff) << 2
+    .dword ($00000000 & $3fffffff) << 2
+    .dword ($fffe0000 & $3fffffff) << 2
 
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@@ -883,8 +884,8 @@ next:
 
 .proc mandelbrot
     ; input:
-    ; cx: position scaled to 8.24 fixed point - -128..+127.9
-    ; cy: position scaled to 8.24
+    ; cx: position scaled to 6.26 fixed point - -32..+31.9
+    ; cy: position scaled to 6.26
     ;
     ; output:
     ; iter: iteration count at escape or 0
@@ -942,7 +943,7 @@ loop:
 keep_going:
 
     .macro quick_exit arg, max
-        ; arg: fixed8.24
+        ; arg: fixed6.26
         ; max: integer
         .local positive
         .local negative
@@ -955,12 +956,12 @@ keep_going:
         bmi negative
 
     positive:
-        cmp #max
+        cmp #(max << 2)
         bmi all_done ; 'less than'
         jmp exit_path
 
     negative:
-        cmp #(256 - max)
+        cmp #(256 - (max << 2))
         beq first_equal ; 'equal' on first byte
         bpl all_done    ; 'greater than'
 
@@ -980,7 +981,7 @@ keep_going:
     all_done:
     .endmacro
 
-    ; 8.24: (-128 .. 127.9)
+    ; 6.26: (-32 .. 31.9)
     ; zx = zx_2  - zy_2  + cx
     sub32 zx, zx_2, zy_2
     add32 zx, zx, cx
@@ -991,9 +992,9 @@ keep_going:
     add32 zy, zy, cy
     quick_exit zy, 2
 
-    ; convert 8.24 -> 4.12: (-8 .. +7.9)
-    shift_round_16 zx, 4
-    shift_round_16 zy, 4
+    ; convert 6.26 -> 3.13: (-4 .. +3.9)
+    shift_round_16 zx, 3
+    shift_round_16 zy, 3
 
     ; zx_2 = zx * zx
     sqr16 zx_2, zx + 2
@@ -1115,9 +1116,9 @@ enough:
 .endmacro
 
 .macro zoom_factor dest, src, aspect
-    ; output: dest: fixed8.24
-    ; input: src: fixed4.12
-    ; aspect: fixed4.12
+    ; output: dest: fixed6.26
+    ; input: src: fixed3.13
+    ; aspect: fixed3.13
     ; clobbers A, X, flags, etc
     copy16 dest, src
     scale_zoom dest
@@ -1426,6 +1427,8 @@ number_keys:
     beq four
     cpy #KEY_5
     beq five
+    cpy #KEY_6
+    beq six
     jmp skip_char
 
 one:
@@ -1442,6 +1445,9 @@ four:
     jmp load_key_viewport
 five:
     ldx #4
+    jmp load_key_viewport
+six:
+    ldx #5
     ; fall through
 load_key_viewport:
     jsr load_viewport
diff --git a/readme.md b/readme.md
index d60644c..2c9efc1 100644
--- a/readme.md
+++ b/readme.md
@@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g
 
 ## Current state
 
-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
 
 The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
 
@@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
 
-The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
+The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
 
 Iterations are capped at 255.
 
diff --git a/todo.md b/todo.md
index 7ab092b..1d46281 100644
--- a/todo.md
+++ b/todo.md
@@ -5,10 +5,6 @@ things to try:
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
-* try 3.13 fixed point instead of 4.12 for more precision
-  * can we get away without the extra bit?
-  * since exit compare space would be 6.26 i think so
-
 * y-axis mirror optimization
 
 * extract viewport for display & re-input via keyboard

From 837082cf56d0b6325788da5d71e444c04f50fb69 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Wed, 1 Jan 2025 15:45:26 -0800
Subject: [PATCH 078/104] tweak viewports

skip experimental 6th viewport that got forgotten
and limit max zoom to 7 (range 0-7) which is what looks good
---
 mandel.s  | 8 +-------
 readme.md | 2 +-
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index 9996c53..8bd1a27 100644
--- a/mandel.s
+++ b/mandel.s
@@ -296,7 +296,6 @@ viewport_zoom:
     .byte 7
     .byte 5
     .byte 7
-    .byte 8
 
 viewport_ox:
     .dword ($00000000 & $3fffffff) << 2
@@ -1393,7 +1392,7 @@ skip_char:
 
 plus:
     lda zoom
-    cmp #8
+    cmp #7
     bpl skip_char
     inc zoom
     jmp done
@@ -1427,8 +1426,6 @@ number_keys:
     beq four
     cpy #KEY_5
     beq five
-    cpy #KEY_6
-    beq six
     jmp skip_char
 
 one:
@@ -1445,9 +1442,6 @@ four:
     jmp load_key_viewport
 five:
     ldx #4
-    jmp load_key_viewport
-six:
-    ldx #5
     ; fall through
 load_key_viewport:
     jsr load_viewport
diff --git a/readme.md b/readme.md
index 2c9efc1..881890a 100644
--- a/readme.md
+++ b/readme.md
@@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g
 
 ## Current state
 
-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 5 preset viewports via the number keys.
 
 The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
 

From dcf5a3f59e1c9c5081a556e93838304abb038c36 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Wed, 1 Jan 2025 21:15:38 -0800
Subject: [PATCH 079/104] sixth viewport

---
 mandel.s  | 8 ++++++++
 readme.md | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index 8bd1a27..6be37d2 100644
--- a/mandel.s
+++ b/mandel.s
@@ -296,6 +296,7 @@ viewport_zoom:
     .byte 7
     .byte 5
     .byte 7
+    .byte 7
 
 viewport_ox:
     .dword ($00000000 & $3fffffff) << 2
@@ -303,6 +304,7 @@ viewport_ox:
     .dword ($ff110000 & $3fffffff) << 2
     .dword ($fe400000 & $3fffffff) << 2
     .dword ($fe3b0000 & $3fffffff) << 2
+    .dword $fd220000
 
 viewport_oy:
     .dword ($00000000 & $3fffffff) << 2
@@ -310,6 +312,7 @@ viewport_oy:
     .dword ($ffbe0000 & $3fffffff) << 2
     .dword ($00000000 & $3fffffff) << 2
     .dword ($fffe0000 & $3fffffff) << 2
+    .dword $ff000000
 
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@@ -1426,6 +1429,8 @@ number_keys:
     beq four
     cpy #KEY_5
     beq five
+    cpy #KEY_6
+    beq six
     jmp skip_char
 
 one:
@@ -1442,6 +1447,9 @@ four:
     jmp load_key_viewport
 five:
     ldx #4
+    jmp load_key_viewport
+six:
+    ldx #5
     ; fall through
 load_key_viewport:
     jsr load_viewport
diff --git a/readme.md b/readme.md
index 881890a..2c9efc1 100644
--- a/readme.md
+++ b/readme.md
@@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g
 
 ## Current state
 
-Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 5 preset viewports via the number keys.
+Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
 
 The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
 

From d157fe1306267caa489a70dd176593873445820b Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 4 Jan 2025 10:06:12 -0800
Subject: [PATCH 080/104] Faster pixel skipping on 4x4, 2x2 tiers

Iterate at fill_masks[fill_level]+1 instead of every pixel and then
skipping, saves a smidge of time

view 1 with expanded memory:
10.514 ms/px before
10.430 ms/px after
---
 mandel.s | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index 6be37d2..210799a 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1737,12 +1737,15 @@ update_status:
 
 skipped:
 
+    ; sx += fill_level[fill_masks] + 1
+    ldx fill_level
+    lda fill_masks,x
     clc
-    lda sx
-    adc #1
+    adc #1 ; will never carry
+    adc sx
     sta sx
-    lda sx + 1
-    adc #0
+    lda #0
+    adc sx + 1
     sta sx + 1
 
     lda sx
@@ -1752,12 +1755,15 @@ skipped:
 
 loop_sx_done:
 
+    ; sy += fill_level[fill_masks] + 1
+    ldx fill_level
+    lda fill_masks,x
     clc
-    lda sy
-    adc #1
+    adc #1 ; will never carry
+    adc sy
     sta sy
-    lda sy + 1
-    adc #0
+    lda #0
+    adc sy + 1
     sta sy + 1
 
     lda sy

From 582ddf497f3c4f1aeae39201b2490dff14ff7f16 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 4 Jan 2025 10:53:51 -0800
Subject: [PATCH 081/104] apply jamey's suggestion of skipping add for high
 byte muls

rather than saving 0 into the high bytes, then adding the high-byte
multiplication later, write it directly in place. this saves a few
cycles on every iteration, and it adds up nicely.

View 1 overview render times:
130XE: 10.050 ms/px - 4m56s
800XL: 10.906 ms/px - 5m21s
---
 mandel.s | 29 ++++-------------------------
 todo.md  |  3 ---
 2 files changed, 4 insertions(+), 28 deletions(-)

diff --git a/mandel.s b/mandel.s
index 210799a..526953a 100644
--- a/mandel.s
+++ b/mandel.s
@@ -464,20 +464,6 @@ viewport_oy:
     sta dest + 1
 .endmacro
 
-; input: arg as u8
-; input/output: dest as u16
-; clobbers a, x
-.macro sqr8_add16 dest, arg
-    ldx arg
-    clc
-    lda sqr_lobyte,x
-    adc dest
-    sta dest
-    lda sqr_hibyte,x
-    adc dest + 1
-    sta dest + 1
-.endmacro
-
 .segment "TABLES"
 ; lookup table for top byte -> PORTB value for bank-switch
 .align 256
@@ -760,9 +746,8 @@ inner_loop:
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
     imul8 result, arg1, arg2, xe
-    lda #0
-    sta result + 2
-    sta result + 3
+
+    imul8 result + 2, arg1 + 1, arg2 + 1, xe
 
     imul8 inter, arg1 + 1, arg2, xe
     add16 result + 1, result + 1, inter
@@ -772,9 +757,6 @@ inner_loop:
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1 + 1, arg2 + 1, xe
-    add16 result + 2, result + 2, inter
-
     ; In case of negative inputs, adjust high word
     ; https://stackoverflow.com/a/28827013
     lda arg1 + 1
@@ -807,9 +789,8 @@ arg2_pos:
         ; h*h*256*256 + h*l*256 + h*l*256 + l*l
 
         sqr8 result, arg
-        lda #0
-        sta result + 2
-        sta result + 3
+
+        sqr8 result + 2, arg + 1
 
         imul8 inter, arg + 1, arg, xe
         add16 result + 1, result + 1, inter
@@ -817,8 +798,6 @@ arg2_pos:
         add16 result + 1, result + 1, inter
         add_carry result + 3
 
-        sqr8_add16 result + 2, arg + 1
-
         rts ; 6 cyc
     .endscope
 .endmacro
diff --git a/todo.md b/todo.md
index 1d46281..a78a2d5 100644
--- a/todo.md
+++ b/todo.md
@@ -1,8 +1,5 @@
 things to try:
 
-* skip add on the top-byte multiply in sqr8/mul8
-  * should save a few cycles, suggestion by jamey
-
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
 * y-axis mirror optimization

From d2bf77dc26218ae1c2a342fd424a7d532d064904 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 4 Jan 2025 12:13:27 -0800
Subject: [PATCH 082/104] todo notes

---
 todo.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/todo.md b/todo.md
index a78a2d5..6807ae2 100644
--- a/todo.md
+++ b/todo.md
@@ -1,7 +1,15 @@
 things to try:
 
+* fix status bar to show elapsed time, per-iter time, per-pixel iter count
+
+* 'turbo' mode disabling graphics in full or part
+
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
+* maybe clean up the load/layout of the big mul table
+
+* consider alternate lookup tables in the top 16KB under ROM
+
 * y-axis mirror optimization
 
 * extract viewport for display & re-input via keyboard

From 7e5ca79d9a4bd419a3a004f7c96a612c9e41cee7 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 4 Jan 2025 14:25:25 -0800
Subject: [PATCH 083/104] move total_ms, total_pixels out of zero page

this frees up 12 bytes of zero page space and costs no measurable
time, as these variables are not in the hot path; the measured
difference was tiny.
---
 mandel.s | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/mandel.s b/mandel.s
index 526953a..317e3b1 100644
--- a/mandel.s
+++ b/mandel.s
@@ -33,8 +33,7 @@ chroma_ticks    = $bd ; u8
 count_frames    = $be ; u8
 count_pixels    = $bf ; u8
 
-total_pixels    = $c0 ; float48
-total_ms        = $c6 ; float48
+; free space c0-cb
 temp            = $cc ; u16
 temp2           = $ce ; u16
 
@@ -63,6 +62,7 @@ FADD   = $DA66 ; ADDITION       (FR0 += FR1)
 FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
 FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
 FDIV   = $DB28 ; DIVISION       (FR0 /= FR1)
+ZFR0   = $DA44 ; clear FR0
 ZF1    = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
 FLD0R  = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
 FLD1R  = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
@@ -203,6 +203,16 @@ ms_per_frame: ; float48 16.66666667
     .byte $66
     .byte $67
 
+total_pixels: ; float48
+    .repeat 6
+        .byte 0
+    .endrepeat
+
+total_ms: ; float48
+    .repeat 6
+        .byte 0
+    .endrepeat
+
 display_list_start:
     ; 24 lines overscan
     .repeat 3
@@ -1565,10 +1575,13 @@ main_loop:
     sta count_pixels
 
     ; total_ms = 0.0; total_pixels = 0.0
-    ldx #total_ms
-    jsr ZF1
-    ldx #total_pixels
-    jsr ZF1
+    jsr ZFR0
+    ldx #.lobyte(total_ms)
+    ldy #.hibyte(total_ms)
+    jsr FST0R
+    ldx #.lobyte(total_pixels)
+    ldy #.hibyte(total_pixels)
+    jsr FST0R
 
     jsr clear_screen
     jsr status_bar
@@ -1691,19 +1704,19 @@ update_status:
     jsr FMUL
 
     ; FR0 += total_ms
-    ldx #total_ms
-    ldy #0
+    ldx #.lobyte(total_ms)
+    ldy #.hibyte(total_ms)
     jsr FLD1R
     jsr FADD
 
     ; total_ms = FR0
-    ldx #total_ms
-    ldy #0
+    ldx #.lobyte(total_ms)
+    ldy #.hibyte(total_ms)
     jsr FST0R
 
     ; FR0 /= total_pixels
-    ldx #total_pixels
-    ldy #0
+    ldx #.lobyte(total_pixels)
+    ldy #.hibyte(total_pixels)
     jsr FLD1R
     jsr FDIV
 

From eaa00a055ac6ff39291a42b458b3e41806025035 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 4 Jan 2025 18:46:51 -0800
Subject: [PATCH 084/104] wip changing time units

it does this weird thing where sometimes it's reading out wrong digits
and then switches to expected unit of sec/px

work in progress no clue what's going on
---
 mandel.s | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mandel.s b/mandel.s
index 317e3b1..f053748 100644
--- a/mandel.s
+++ b/mandel.s
@@ -141,7 +141,7 @@ str_self:
     .byte "MANDEL-6502"
 str_self_end:
 str_speed:
-    .byte " ms/px"
+    .byte " sec/px"
 str_speed_end:
 str_run:
     .byte " RUN"
@@ -195,9 +195,9 @@ aspect_x: ; fixed3.13 5/4
 aspect_y: ; fixed3.13 3/4
     .word 3 << (13 - 2)
 
-ms_per_frame: ; float48 16.66666667
-    .byte 64  ; exponent/sign
-    .byte $16 ; BCD digits
+sec_per_frame: ; float48 0.016666667
+    .byte 63  ; exponent/sign
+    .byte $01 ; BCD digits
     .byte $66
     .byte $66
     .byte $66
@@ -208,7 +208,7 @@ total_pixels: ; float48
         .byte 0
     .endrepeat
 
-total_ms: ; float48
+total_sec: ; float48
     .repeat 6
         .byte 0
     .endrepeat
@@ -1574,10 +1574,10 @@ main_loop:
     sta count_frames
     sta count_pixels
 
-    ; total_ms = 0.0; total_pixels = 0.0
+    ; total_sec = 0.0; total_pixels = 0.0
     jsr ZFR0
-    ldx #.lobyte(total_ms)
-    ldy #.hibyte(total_ms)
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
     jsr FST0R
     ldx #.lobyte(total_pixels)
     ldy #.hibyte(total_pixels)
@@ -1697,21 +1697,21 @@ update_status:
     sta count_frames
     jsr IFP
 
-    ; FR0 *= ms_per_frame
-    ldx #.lobyte(ms_per_frame)
-    ldy #.hibyte(ms_per_frame)
+    ; FR0 *= sec_per_frame
+    ldx #.lobyte(sec_per_frame)
+    ldy #.hibyte(sec_per_frame)
     jsr FLD1R
     jsr FMUL
 
-    ; FR0 += total_ms
-    ldx #.lobyte(total_ms)
-    ldy #.hibyte(total_ms)
+    ; FR0 += total_sec
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
     jsr FLD1R
     jsr FADD
 
-    ; total_ms = FR0
-    ldx #.lobyte(total_ms)
-    ldy #.hibyte(total_ms)
+    ; total_sec = FR0
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
     jsr FST0R
 
     ; FR0 /= total_pixels

From 918d15e8139d21c15f05776bfdb6780000a687f9 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 5 Jan 2025 14:05:24 -0800
Subject: [PATCH 085/104] wip us/iter counter

seems wrong, gives 32 all the time and that seems too small
---
 mandel.s | 103 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 68 insertions(+), 35 deletions(-)

diff --git a/mandel.s b/mandel.s
index f053748..09af6a5 100644
--- a/mandel.s
+++ b/mandel.s
@@ -31,9 +31,10 @@ chroma_offset   = $bb ; u8
 palette_ticks   = $bc ; u8
 chroma_ticks    = $bd ; u8
 count_frames    = $be ; u8
-count_pixels    = $bf ; u8
+; free space $bf
 
-; free space c0-cb
+count_iters     = $c0 ; u16
+; free space c2-cb
 temp            = $cc ; u16
 temp2           = $ce ; u16
 
@@ -58,6 +59,7 @@ LBUFF  = $0580 ; result buffer for FASC routine
 ; FP ROM routine vectors
 FASC   = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
 IFP    = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
+FPI    = $D9D2 ; floating point to integer
 FADD   = $DA66 ; ADDITION       (FR0 += FR1)
 FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
 FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
@@ -141,7 +143,7 @@ str_self:
     .byte "MANDEL-6502"
 str_self_end:
 str_speed:
-    .byte " sec/px"
+    .byte "us/iter: "
 str_speed_end:
 str_run:
     .byte " RUN"
@@ -154,7 +156,7 @@ str_self_len = str_self_end - str_self
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
-speed_precision = 6
+speed_precision = 5
 
 speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
 speed_len = 14 + str_speed_len
@@ -196,14 +198,22 @@ aspect_y: ; fixed3.13 3/4
     .word 3 << (13 - 2)
 
 sec_per_frame: ; float48 0.016666667
-    .byte 63  ; exponent/sign
+    .byte 63  ; exponent/sign - -2
     .byte $01 ; BCD digits
     .byte $66
     .byte $66
     .byte $66
     .byte $67
 
-total_pixels: ; float48
+us_per_sec: ; float48 1e9
+    .byte 68  ; exponent/sign +8
+    .byte $10 ; BCD digits
+    .byte $00
+    .byte $00
+    .byte $00
+    .byte $00
+
+total_iters: ; float48
     .repeat 6
         .byte 0
     .endrepeat
@@ -927,6 +937,11 @@ next:
     sta z_buffer_end
 
 loop:
+    inc count_iters
+    bne low_iters
+    inc count_iters + 1
+low_iters:
+
     ; iter++ & max-iters break
     inc iter
     bne keep_going
@@ -1230,6 +1245,7 @@ done:
     ; clobbers A, X
     .local loop
     .local done
+    .local padding
     ldx #0
 loop:
     cpx #len
@@ -1237,11 +1253,23 @@ loop:
     txa
     tay
     lda (strptr),y
+    bmi padding
     tay
     lda char_map,y
     sta textbuffer + col,x
     inx
     jmp loop
+
+padding:
+    ldy #32 ; space
+    lda char_map,y
+
+    cpx #len
+    beq done
+    sta textbuffer + col,x
+    inx
+    jmp padding
+
 done:
 .endmacro
 
@@ -1569,18 +1597,19 @@ copy_byte_loop:
     jsr SETVBV
 
 main_loop:
-    ; count_frames = 0; count_pixels = 0
+    ; count_frames = 0; count_iters = 0
     lda #0
     sta count_frames
-    sta count_pixels
+    sta count_iters
+    sta count_iters + 1
 
-    ; total_sec = 0.0; total_pixels = 0.0
+    ; total_sec = 0.0; total_iters = 0.0
     jsr ZFR0
     ldx #.lobyte(total_sec)
     ldy #.hibyte(total_sec)
     jsr FST0R
-    ldx #.lobyte(total_pixels)
-    ldy #.hibyte(total_pixels)
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
     jsr FST0R
 
     jsr clear_screen
@@ -1653,38 +1682,32 @@ not_skipped_mask:
 
 no_key:
     ; check if we should update the counters
-    ;
-    ; count_pixels >= width? update!
-    inc count_pixels
-    lda count_pixels
-    cmp #width
-    bmi update_status
 
     ; count_frames >= 120? update!
     lda count_frames
     cmp #120 ; >= 2 seconds
-    bmi skipped
+    bpl update_status
+    jmp skipped
 
 update_status:
-    ; FR0 = (float)count_pixels & clear count_pixels
-    lda count_pixels
-    sta FR0
-    lda #0
-    sta FR0 + 1
-    sta count_pixels
+    ; FR0 = (float)count_iters & clear count_iters
+    copy16 FR0, count_iters
     jsr IFP
+    lda #0
+    sta count_iters
+    sta count_iters + 1
 
-    ; FR1 = total_pixels
-    ldx #.lobyte(total_pixels)
-    ldy #.hibyte(total_pixels)
+    ; FR1 = total_iters
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
     jsr FLD1R
 
     ; FR0 += FR1
     jsr FADD
 
-    ; total_pixels = FR0
-    ldx #.lobyte(total_pixels)
-    ldy #.hibyte(total_pixels)
+    ; total_iters = FR0
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
     jsr FST0R
 
 
@@ -1714,18 +1737,28 @@ update_status:
     ldy #.hibyte(total_sec)
     jsr FST0R
 
-    ; FR0 /= total_pixels
-    ldx #.lobyte(total_pixels)
-    ldy #.hibyte(total_pixels)
+    ; FR0 /= total_iters
+    ldx #.lobyte(total_iters)
+    ldy #.hibyte(total_iters)
     jsr FLD1R
     jsr FDIV
 
+    ; FR0 *= us_per_sec
+    ldx #.lobyte(us_per_sec)
+    ldy #.hibyte(us_per_sec)
+    jsr FLD1R
+    jsr FMUL
+
+    ; round to integer
+    jsr FPI
+    jsr IFP
+
     ; convert to ASCII in INBUFF
     jsr FASC
 
     ; print the first 6 digits
-    draw_text_indirect speed_start, speed_precision, INBUFF
-    draw_text speed_start + speed_precision, str_speed_len, str_speed
+    draw_text speed_start, str_speed_len, str_speed
+    draw_text_indirect speed_start + str_speed_len, speed_precision, INBUFF
 
 skipped:
 

From 7c04862d70b16a8e35392255371cdbaca0340396 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 5 Jan 2025 14:29:27 -0800
Subject: [PATCH 086/104] workaround for rounding us/iter

for some reason rounding is giving me wrong results
not sure what i'm doing wrong :D

just show 6 digits :P

ok this gets the us/iter working, and it is more stable
but the elapsed time still needs to be added
---
 mandel.s | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index 09af6a5..7466cd9 100644
--- a/mandel.s
+++ b/mandel.s
@@ -156,7 +156,7 @@ str_self_len = str_self_end - str_self
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
-speed_precision = 5
+speed_precision = 6
 
 speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
 speed_len = 14 + str_speed_len
@@ -197,17 +197,17 @@ aspect_x: ; fixed3.13 5/4
 aspect_y: ; fixed3.13 3/4
     .word 3 << (13 - 2)
 
-sec_per_frame: ; float48 0.016666667
-    .byte 63  ; exponent/sign - -2
+sec_per_frame: ; float48 00 . 01 66 66 66 67
+    .byte 63  ; exponent/sign - -1 bytes
     .byte $01 ; BCD digits
     .byte $66
     .byte $66
     .byte $66
     .byte $67
 
-us_per_sec: ; float48 1e9
-    .byte 68  ; exponent/sign +8
-    .byte $10 ; BCD digits
+us_per_sec: ; float48 1e9 01 00 0,0 00 . 00
+    .byte 67  ; exponent/sign +3 bytes
+    .byte $01 ; BCD digits
     .byte $00
     .byte $00
     .byte $00
@@ -1749,9 +1749,12 @@ update_status:
     jsr FLD1R
     jsr FMUL
 
+    ; @fixme
     ; round to integer
-    jsr FPI
-    jsr IFP
+    ; for some reason this gives bad results?
+    ;clc
+    ;jsr FPI
+    ;jsr IFP
 
     ; convert to ASCII in INBUFF
     jsr FASC

From e0cc704d9906a3801b8812e3cb994f71995391bf Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Wed, 8 Jan 2025 18:34:46 -0800
Subject: [PATCH 087/104] Fix drawing terminator, round usec

---
 mandel.s | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/mandel.s b/mandel.s
index 7466cd9..db072be 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1253,11 +1253,15 @@ loop:
     txa
     tay
     lda (strptr),y
-    bmi padding
+    pha ; save the char for terminator check
+    and #$7f ; strip the high bit (terminator)
     tay
     lda char_map,y
     sta textbuffer + col,x
     inx
+
+    pla
+    bmi padding
     jmp loop
 
 padding:
@@ -1749,12 +1753,10 @@ update_status:
     jsr FLD1R
     jsr FMUL
 
-    ; @fixme
-    ; round to integer
-    ; for some reason this gives bad results?
-    ;clc
-    ;jsr FPI
-    ;jsr IFP
+    ; round (down) to integer
+    jsr FPI
+    clc
+    jsr IFP
 
     ; convert to ASCII in INBUFF
     jsr FASC

From d182d33b3579668be8db81034f4197f6ee381fa8 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 1 Feb 2025 10:02:01 -0800
Subject: [PATCH 088/104] draw_string

---
 mandel.s | 141 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 84 insertions(+), 57 deletions(-)

diff --git a/mandel.s b/mandel.s
index db072be..b7c323b 100644
--- a/mandel.s
+++ b/mandel.s
@@ -34,7 +34,9 @@ count_frames    = $be ; u8
 ; free space $bf
 
 count_iters     = $c0 ; u16
-; free space c2-cb
+text_col        = $c2 ; u8
+text_row        = $c3 ; u8
+; free space c4-cb
 temp            = $cc ; u16
 temp2           = $ce ; u16
 
@@ -140,16 +142,16 @@ KEY_0     = 50
 
 strings:
 str_self:
-    .byte "MANDEL-6502"
+    .byte "MANDEL-6502", 0
 str_self_end:
 str_speed:
-    .byte "us/iter: "
+    .byte "us/iter: ", 0
 str_speed_end:
 str_run:
-    .byte " RUN"
+    .byte " RUN", 0
 str_run_end:
 str_done:
-    .byte "DONE"
+    .byte "DONE", 0
 str_done_end:
 
 str_self_len = str_self_end - str_self
@@ -1241,57 +1243,50 @@ done:
     rts
 .endproc
 
-.macro draw_text_indirect col, len, strptr
-    ; clobbers A, X
-    .local loop
-    .local done
-    .local padding
-    ldx #0
+; in/out: column in text_col
+; in: row in text_row @fixme implement
+; in: pointer to string in INBUFF
+; clobbers x/y/a/temp
+.proc draw_string
+    drawptr = temp
+    strptr = INBUFF
+
+    clc
+    lda #.lobyte(textbuffer)
+    adc text_col
+    sta temp
+    lda #.hibyte(textbuffer)
+    adc #0
+    sta temp + 1
+
+    ldy #0
 loop:
-    cpx #len
-    beq done
-    txa
-    tay
     lda (strptr),y
-    pha ; save the char for terminator check
-    and #$7f ; strip the high bit (terminator)
-    tay
-    lda char_map,y
-    sta textbuffer + col,x
-    inx
+    ; if char's null, terminate c-style
+    beq done
+    ; save the char for terminator check
+    pha
+    ; strip the high bit (terminator)
+    and #$7f
+    tax
+    lda char_map,x
+    sta (drawptr),y
+    iny
 
     pla
-    bmi padding
+    ; _last_ char has high bit set in atari rom routines
+    bmi done
     jmp loop
 
-padding:
-    ldy #32 ; space
-    lda char_map,y
-
-    cpx #len
-    beq done
-    sta textbuffer + col,x
-    inx
-    jmp padding
-
 done:
-.endmacro
+    ; move the text column pointer
+    tya
+    clc
+    adc text_col
+    sta text_col
 
-.macro draw_text col, len, cstr
-    ; clobbers A, X
-    .local loop
-    .local done
-    ldx #0
-loop:
-    cpx #len
-    beq done
-    ldy cstr,x
-    lda char_map,y
-    sta textbuffer + col,x
-    inx
-    jmp loop
-done:
-.endmacro
+    rts
+.endproc
 
 .proc vblank_handler
     inc count_frames
@@ -1506,8 +1501,24 @@ zero_byte_loop:
 
 .proc status_bar
     ; Status bar
-    draw_text 0, str_self_len, str_self
-    draw_text 40 - str_run_len, str_run_len, str_run
+
+    lda #0
+    sta text_col
+    lda #0
+    sta text_row
+    lda #.lobyte(str_self)
+    sta INBUFF
+    lda #.hibyte(str_self)
+    sta INBUFF + 1
+    jsr draw_string
+
+    lda #(40 - str_run_len)
+    sta text_col
+    lda #.lobyte(str_run)
+    sta INBUFF
+    lda #.hibyte(str_run)
+    sta INBUFF + 1
+    jsr draw_string
 
     rts
 .endproc
@@ -1758,13 +1769,19 @@ update_status:
     clc
     jsr IFP
 
-    ; convert to ASCII in INBUFF
+    lda #speed_start
+    sta text_col
+    lda #0
+    sta text_row
+    lda #.lobyte(str_speed)
+    sta INBUFF
+    lda #.hibyte(str_speed)
+    sta INBUFF + 1
+    jsr draw_string
+
+    ; convert to ASCII in INBUFF and print
     jsr FASC
-
-    ; print the first 6 digits
-    draw_text speed_start, str_speed_len, str_speed
-    draw_text_indirect speed_start + str_speed_len, speed_precision, INBUFF
-
+    jsr draw_string
 skipped:
 
     ; sx += fill_level[fill_masks] + 1
@@ -1812,7 +1829,17 @@ fill_loop_done:
 
 loop:
     ; finished
-    draw_text 40 - str_done_len, str_done_len, str_done
+
+    lda #(40 - str_done_len)
+    sta text_col
+    lda #0
+    sta text_row
+    lda #.lobyte(str_done)
+    sta INBUFF
+    lda #.hibyte(str_done)
+    sta INBUFF + 1
+    jsr draw_string
+
     jsr keycheck
     beq loop
     jmp main_loop

From 25da81c64bfddf0a39e09288386ad76e19290be2 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 2 Feb 2025 16:40:58 -0800
Subject: [PATCH 089/104] clean up text draw, fix offset by one

---
 mandel.s | 60 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/mandel.s b/mandel.s
index b7c323b..ed7ae40 100644
--- a/mandel.s
+++ b/mandel.s
@@ -142,26 +142,32 @@ KEY_0     = 50
 
 strings:
 str_self:
-    .byte "MANDEL-6502", 0
+    .byte "MANDEL-6502"
 str_self_end:
+    .byte 0
 str_speed:
-    .byte "us/iter: ", 0
+    .byte "us/iter: "
 str_speed_end:
+    .byte 0
 str_run:
-    .byte " RUN", 0
+    .byte " RUN"
 str_run_end:
+    .byte 0
 str_done:
-    .byte "DONE", 0
+    .byte "DONE"
 str_done_end:
+    .byte 0
+str_padding:
+    .byte "      "
+str_padding_end:
+    .byte 0
 
-str_self_len = str_self_end - str_self
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
-speed_precision = 6
+str_padding_len = str_padding_end - str_padding
 
-speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
-speed_len = 14 + str_speed_len
+speed_start = 40 - str_done_len - str_speed_len - str_padding_len - 1
 
 
 char_map:
@@ -1288,6 +1294,14 @@ done:
     rts
 .endproc
 
+.macro draw_string_const str
+    lda #.lobyte(str)
+    sta INBUFF
+    lda #.hibyte(str)
+    sta INBUFF + 1
+    jsr draw_string
+.endmacro
+
 .proc vblank_handler
     inc count_frames
 
@@ -1506,19 +1520,11 @@ zero_byte_loop:
     sta text_col
     lda #0
     sta text_row
-    lda #.lobyte(str_self)
-    sta INBUFF
-    lda #.hibyte(str_self)
-    sta INBUFF + 1
-    jsr draw_string
+    draw_string_const str_self
 
     lda #(40 - str_run_len)
     sta text_col
-    lda #.lobyte(str_run)
-    sta INBUFF
-    lda #.hibyte(str_run)
-    sta INBUFF + 1
-    jsr draw_string
+    draw_string_const str_run
 
     rts
 .endproc
@@ -1773,11 +1779,13 @@ update_status:
     sta text_col
     lda #0
     sta text_row
-    lda #.lobyte(str_speed)
-    sta INBUFF
-    lda #.hibyte(str_speed)
-    sta INBUFF + 1
-    jsr draw_string
+    draw_string_const str_speed
+
+    lda text_col
+    pha
+    draw_string_const str_padding
+    pla
+    sta text_col
 
     ; convert to ASCII in INBUFF and print
     jsr FASC
@@ -1834,11 +1842,7 @@ loop:
     sta text_col
     lda #0
     sta text_row
-    lda #.lobyte(str_done)
-    sta INBUFF
-    lda #.hibyte(str_done)
-    sta INBUFF + 1
-    jsr draw_string
+    draw_string_const str_done
 
     jsr keycheck
     beq loop

From 26d612b6f3640e4ee6e15eb9817f0037ee91ed80 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Fri, 21 Feb 2025 19:42:10 -0800
Subject: [PATCH 090/104] move 8 scan lines on the bottom to status bar

---
 mandel.s | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index ed7ae40..81ee0eb 100644
--- a/mandel.s
+++ b/mandel.s
@@ -80,7 +80,7 @@ framebuffer_bottom = $b000
 display_list       = $bf00
 framebuffer_end    = $c000
 
-height = 184
+height = 176
 half_height = height >> 1
 width = 160
 half_width = width >> 1
@@ -254,6 +254,10 @@ display_list_start:
         .byte $0e
     .endrep
 
+    ; 8 scan lines, 1 row of 40-column text
+    .byte $42
+    .addr textbuffer + 40
+
     .byte $41 ; jump and blank
     .addr display_list
 display_list_end:

From 07db3d00d7ff36afe331c6adbeae2398a53c8173 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 22 Feb 2025 11:23:13 -0800
Subject: [PATCH 091/104] second status bar display with coords/zoom

currently using 3.13 precision to output to floats for formatting
---
 mandel.s | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 160 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 81ee0eb..a89be85 100644
--- a/mandel.s
+++ b/mandel.s
@@ -169,6 +169,27 @@ str_padding_len = str_padding_end - str_padding
 
 speed_start = 40 - str_done_len - str_speed_len - str_padding_len - 1
 
+col_x = 1
+str_x:
+    .byte "X:"
+    .byte 0
+str_x_len = 2
+str_x_space = 12
+str_x_padding = 2
+
+col_y = str_x_len + str_x_space + str_x_padding
+str_y:
+    .byte "Y:"
+    .byte 0
+str_y_len = 2
+str_y_space = 12
+str_y_padding = 2
+
+col_zoom = col_y + str_y_len + str_y_space + str_y_padding
+str_zoom:
+    .byte "ZOOM:"
+    .byte 0
+str_zoom_len = 5
 
 char_map:
     ; Map ATASCII string values to framebuffer font entries
@@ -205,6 +226,17 @@ aspect_x: ; fixed3.13 5/4
 aspect_y: ; fixed3.13 3/4
     .word 3 << (13 - 2)
 
+fixed3_13_as_float: ; float48
+    ; 1 << 13
+    ; 8192
+    ; 81 92 . 00 00 00
+    .byte 65 ; exponent/sign - +1 byte
+    .byte $81
+    .byte $92
+    .byte $00
+    .byte $00
+    .byte $00
+
 sec_per_frame: ; float48 00 . 01 66 66 66 67
     .byte 63  ; exponent/sign - -1 bytes
     .byte $01 ; BCD digits
@@ -895,6 +927,68 @@ next:
 
 .endmacro
 
+; input in FR0, 16 bits signed 3.13 fixed
+; output in FR0, Atari float
+; clobbers a, x, y, FR0, FR1
+.proc fixed3_13_to_float
+    ldx #.lobyte(fixed3_13_as_float)
+    ldy #.hibyte(fixed3_13_as_float)
+    jsr FLD1R
+
+    ; check sign bit! conversion routine is for unsigned
+    lda FR0 + 1
+    bpl positive
+
+negative:
+    neg16 FR0
+    jsr IFP
+
+    ; set float sign bit
+    lda FR0
+    ora #$80
+    sta FR0
+    jmp common
+
+positive:
+    jsr IFP
+
+common:
+    jsr FDIV
+    rts
+
+.endproc
+
+; input in FR0, Atari float
+; output in FR0, 16 bits signed 3.13 fixed
+; clobbers a, x, y, FR0, FR1
+.proc float_to_fixed3_13
+    ldx #.lobyte(fixed3_13_as_float)
+    ldy #.hibyte(fixed3_13_as_float)
+    jsr FLD1R
+    jsr FMUL
+
+    ; check sign bit! conversion routine is for unsigned
+    lda FR0
+    bcc positive
+
+negative:
+    ; clearfloat sign bit
+    lda FR0
+    eor #$80
+    sta FR0
+
+    jsr FPI
+    neg16 FR0
+    jmp common
+
+positive:
+    jsr FPI
+
+common:
+    rts
+
+.endproc
+
 .proc mandelbrot
     ; input:
     ; cx: position scaled to 6.26 fixed point - -32..+31.9
@@ -1254,7 +1348,7 @@ done:
 .endproc
 
 ; in/out: column in text_col
-; in: row in text_row @fixme implement
+; in: row in text_row
 ; in: pointer to string in INBUFF
 ; clobbers x/y/a/temp
 .proc draw_string
@@ -1269,6 +1363,21 @@ done:
     adc #0
     sta temp + 1
 
+    ldx text_row
+    beq done_rows
+continue_rows:
+    clc
+    lda temp
+    adc #40
+    sta temp
+    lda temp + 1
+    adc #0
+    sta temp + 1
+    dex
+    bne continue_rows
+
+done_rows:
+
     ldy #0
 loop:
     lda (strptr),y
@@ -1429,7 +1538,7 @@ skip_char:
 
 plus:
     lda zoom
-    cmp #7
+    cmp #8
     bpl skip_char
     inc zoom
     jmp done
@@ -1441,15 +1550,19 @@ minus:
     jmp done
 up:
     sub32 oy, oy, temp
+    jsr display_coords
     jmp done
 down:
     add32 oy, oy, temp
+    jsr display_coords
     jmp done
 left:
     sub32 ox, ox, temp
+    jsr display_coords
     jmp done
 right:
     add32 ox, ox, temp
+    jsr display_coords
     jmp done
 
 number_keys:
@@ -1533,6 +1646,49 @@ zero_byte_loop:
     rts
 .endproc
 
+.proc display_coords
+    lda #1
+    sta text_row
+    lda #col_x
+    sta text_col
+    draw_string_const str_x
+
+    copy32 FR0, ox
+    shift_round_16 FR0, 3
+    copy16 FR0, FR0 + 2
+    jsr fixed3_13_to_float
+    jsr FASC
+    jsr draw_string
+
+    lda #col_y
+    sta text_col
+    draw_string_const str_y
+
+    copy32 FR0, oy
+    shift_round_16 FR0, 3
+    copy16 FR0, FR0 + 2
+    jsr fixed3_13_to_float
+    jsr FASC
+    jsr draw_string
+
+    lda #col_zoom
+    sta text_col
+    draw_string_const str_zoom
+
+    lda zoom
+    clc
+    adc #0
+    sta FR0
+    lda #0
+    sta FR0 + 1
+    jsr IFP
+    jsr FASC
+    jsr draw_string
+
+    rts
+
+.endproc
+
 ; input: viewport selector in x
 ; clobbers: a, x
 .proc load_viewport
@@ -1584,6 +1740,7 @@ zero_byte_loop:
     sta DMACTL
 
     jsr clear_screen
+    jsr display_coords
 
     ; Copy the display list into properly aligned memory
     ; Can't cross 1024-byte boundaries :D
@@ -1639,6 +1796,7 @@ main_loop:
 
     jsr clear_screen
     jsr status_bar
+    jsr display_coords
 
     lda #0
     sta fill_level

From 6e66145ec6ea2be22f81819c2c797bfa9b1425aa Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 22 Feb 2025 15:37:11 -0800
Subject: [PATCH 092/104] whoops fixes

---
 mandel.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index a89be85..9840325 100644
--- a/mandel.s
+++ b/mandel.s
@@ -177,7 +177,7 @@ str_x_len = 2
 str_x_space = 12
 str_x_padding = 2
 
-col_y = str_x_len + str_x_space + str_x_padding
+col_y = col_x + str_x_len + str_x_space + str_x_padding
 str_y:
     .byte "Y:"
     .byte 0
@@ -1538,7 +1538,7 @@ skip_char:
 
 plus:
     lda zoom
-    cmp #8
+    cmp #7
     bpl skip_char
     inc zoom
     jmp done

From 89b4e459013a74b070f730a4389e2ce90a177b57 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 22 Feb 2025 20:24:04 -0800
Subject: [PATCH 093/104] flip the y coordinate sign

---
 mandel.s | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index 9840325..3419466 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1549,11 +1549,11 @@ minus:
     dec zoom
     jmp done
 up:
-    sub32 oy, oy, temp
+    add32 oy, oy, temp
     jsr display_coords
     jmp done
 down:
-    add32 oy, oy, temp
+    sub32 oy, oy, temp
     jsr display_coords
     jmp done
 left:
@@ -1854,6 +1854,7 @@ not_skipped_mask:
     zoom_factor cx, sx, aspect_x
     add32 cx, cx, ox
     zoom_factor cy, sy, aspect_y
+    neg32 cy
     add32 cy, cy, oy
     jsr mandelbrot
     jsr pset

From 689363d0834c1cc3a04a19999a3ca2485a3a0f30 Mon Sep 17 00:00:00 2001
From: Brooke <brooke@Starbase.local>
Date: Sun, 22 Jun 2025 20:00:35 -0700
Subject: [PATCH 094/104] WIP code for elapsed time

not finished, doesn't work right
---
 mandel.s | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index 3419466..dc7e323 100644
--- a/mandel.s
+++ b/mandel.s
@@ -162,12 +162,28 @@ str_padding:
 str_padding_end:
     .byte 0
 
+str_space:
+    .byte " "
+    .byte 0
+
+str_h:
+    .byte "h"
+    .byte 0
+str_m:
+    .byte "m"
+    .byte 0
+str_s:
+    .byte "s"
+    .byte 0
+
 str_speed_len = str_speed_end - str_speed
 str_run_len = str_run_end - str_run
 str_done_len = str_done_end - str_done
 str_padding_len = str_padding_end - str_padding
 
-speed_start = 40 - str_done_len - str_speed_len - str_padding_len - 1
+; "3h59m59s"
+str_elapsed_spacer = 8
+speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1
 
 col_x = 1
 str_x:
@@ -1953,6 +1969,55 @@ update_status:
     ; convert to ASCII in INBUFF and print
     jsr FASC
     jsr draw_string
+
+    ; elapsed time
+    ; FR0 = total_sec
+    ldx #.lobyte(total_sec)
+    ldy #.hibyte(total_sec)
+    jsr FLD0R
+    ; FR0 -> integer -> temp
+    jsr FPI
+    lda FR0
+    sta temp
+    lda FR0 + 1
+    sta temp + 1
+
+    .macro countdown divisor, string
+        .scope
+            ; count the hours
+            ldx #0
+        countdown_loop:
+            lda temp + 1
+            cmp #.hibyte(divisor)
+            bmi countdown_done
+            lda temp
+            cmp #.lobyte(divisor)
+            bmi countdown_done
+            sec
+            lda temp
+            sbc #.lobyte(divisor)
+            sta temp
+            lda temp + 1
+            sbc #.hibyte(divisor)
+            sta temp + 1
+            inx
+            jmp countdown_loop
+        countdown_done:
+            stx FR0
+            ldx #0
+            stx FR0 + 1
+            jsr IFP
+            jsr FASC
+            jsr draw_string
+            draw_string_const string
+        .endscope
+    .endmacro
+
+    draw_string_const str_space
+    countdown 3600, str_h
+    countdown 60, str_m
+    countdown 1, str_s
+
 skipped:
 
     ; sx += fill_level[fill_masks] + 1

From f7082ab371b0454c32ed2b120cdac5f28e02a587 Mon Sep 17 00:00:00 2001
From: Brooke <brooke@Starbase.local>
Date: Sun, 22 Jun 2025 22:21:26 -0700
Subject: [PATCH 095/104] wip subtraction method, still not working

---
 mandel.s | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/mandel.s b/mandel.s
index dc7e323..73a8ebb 100644
--- a/mandel.s
+++ b/mandel.s
@@ -221,8 +221,12 @@ char_map:
     .endrepeat
 
 hex_chars:
+digits_zero:
     .byte "0123456789abcdef"
 
+digits_space:
+    .byte " 123456789abcdef"
+
 aspect:
     ; aspect ratio!
     ; pixels at 320w are 5:6 (narrow)
@@ -1982,7 +1986,11 @@ update_status:
     lda FR0 + 1
     sta temp + 1
 
-    .macro countdown divisor, string
+    ;jsr IFP
+    ;jsr FASC
+    ;jsr draw_string
+
+    .macro countdown divisor, digits
         .scope
             ; count the hours
             ldx #0
@@ -2003,20 +2011,27 @@ update_status:
             inx
             jmp countdown_loop
         countdown_done:
-            stx FR0
-            ldx #0
-            stx FR0 + 1
-            jsr IFP
-            jsr FASC
+            lda digits,x
+            eor #$80
+            sta temp2
+            lda #.lobyte(temp2)
+            sta INBUFF
+            lda #.hibyte(temp2)
+            sta INBUFF + 1
             jsr draw_string
-            draw_string_const string
         .endscope
     .endmacro
 
     draw_string_const str_space
-    countdown 3600, str_h
-    countdown 60, str_m
-    countdown 1, str_s
+    countdown 36000, digits_space
+    countdown 3600, digits_zero
+    draw_string_const str_h
+    countdown 600, digits_zero
+    countdown 60, digits_zero
+    draw_string_const str_m
+    countdown 10, digits_zero
+    countdown 1, digits_zero
+    draw_string_const str_s
 
 skipped:
 

From 5cf64970c8a9ca8efe7821a9f425cb9282c92e29 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sun, 22 Jun 2025 23:10:43 -0700
Subject: [PATCH 096/104] Ah that's better

used the appropriate instruction for comparison
---
 mandel.s | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/mandel.s b/mandel.s
index 73a8ebb..6837e00 100644
--- a/mandel.s
+++ b/mandel.s
@@ -398,6 +398,11 @@ viewport_oy:
     .dword ($fffe0000 & $3fffffff) << 2
     .dword $ff000000
 
+elapsed_work:
+    .dword 0
+elapsed_digit:
+    .byte 0
+
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
     clc ; 2 cyc
@@ -1979,12 +1984,12 @@ update_status:
     ldx #.lobyte(total_sec)
     ldy #.hibyte(total_sec)
     jsr FLD0R
-    ; FR0 -> integer -> temp
+    ; FR0 -> integer -> elapsed_work
     jsr FPI
     lda FR0
-    sta temp
+    sta elapsed_work
     lda FR0 + 1
-    sta temp + 1
+    sta elapsed_work + 1
 
     ;jsr IFP
     ;jsr FASC
@@ -1995,28 +2000,28 @@ update_status:
             ; count the hours
             ldx #0
         countdown_loop:
-            lda temp + 1
+            lda elapsed_work + 1
             cmp #.hibyte(divisor)
-            bmi countdown_done
-            lda temp
+            bcc countdown_done
+            lda elapsed_work
             cmp #.lobyte(divisor)
-            bmi countdown_done
+            bcc countdown_done
             sec
-            lda temp
+            lda elapsed_work
             sbc #.lobyte(divisor)
-            sta temp
-            lda temp + 1
+            sta elapsed_work
+            lda elapsed_work + 1
             sbc #.hibyte(divisor)
-            sta temp + 1
+            sta elapsed_work + 1
             inx
             jmp countdown_loop
         countdown_done:
             lda digits,x
             eor #$80
-            sta temp2
-            lda #.lobyte(temp2)
+            sta elapsed_digit
+            lda #.lobyte(elapsed_digit)
             sta INBUFF
-            lda #.hibyte(temp2)
+            lda #.hibyte(elapsed_digit)
             sta INBUFF + 1
             jsr draw_string
         .endscope

From 4bac47a4fdad2fd56f9023eaae9a4be83fe9fced Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 23 Jun 2025 00:31:53 -0700
Subject: [PATCH 097/104] fix at 256 seconds

---
 mandel.s | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mandel.s b/mandel.s
index 6837e00..714f6b6 100644
--- a/mandel.s
+++ b/mandel.s
@@ -2002,10 +2002,14 @@ update_status:
         countdown_loop:
             lda elapsed_work + 1
             cmp #.hibyte(divisor)
+            beq countdown_lobyte
             bcc countdown_done
+            bcs countdown_inc
+        countdown_lobyte:
             lda elapsed_work
             cmp #.lobyte(divisor)
             bcc countdown_done
+        countdown_inc:
             sec
             lda elapsed_work
             sbc #.lobyte(divisor)

From fd954da47e75a99b0033ec11f0dd145e1d8ab544 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 23 Jun 2025 08:17:39 -0700
Subject: [PATCH 098/104] Create map file for convenience

export a symbol and it'll appear in mandel.map
---
 Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index bd14c7d..711adcd 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
 all : mandel.xex
 
 mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o
 
 %.o : %.s
 	ca65 -o $@ $<
@@ -15,4 +15,6 @@ clean :
 	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
+	rm -f mandel.map
+
 

From fab2760394b9b74f07488bc345997fbd9e566389 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 28 Jun 2025 13:43:43 -0700
Subject: [PATCH 099/104] refactor countdown as a procedure call

---
 mandel.s | 104 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/mandel.s b/mandel.s
index 714f6b6..cad4e5e 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1991,55 +1991,25 @@ update_status:
     lda FR0 + 1
     sta elapsed_work + 1
 
-    ;jsr IFP
-    ;jsr FASC
-    ;jsr draw_string
-
-    .macro countdown divisor, digits
-        .scope
-            ; count the hours
-            ldx #0
-        countdown_loop:
-            lda elapsed_work + 1
-            cmp #.hibyte(divisor)
-            beq countdown_lobyte
-            bcc countdown_done
-            bcs countdown_inc
-        countdown_lobyte:
-            lda elapsed_work
-            cmp #.lobyte(divisor)
-            bcc countdown_done
-        countdown_inc:
-            sec
-            lda elapsed_work
-            sbc #.lobyte(divisor)
-            sta elapsed_work
-            lda elapsed_work + 1
-            sbc #.hibyte(divisor)
-            sta elapsed_work + 1
-            inx
-            jmp countdown_loop
-        countdown_done:
-            lda digits,x
-            eor #$80
-            sta elapsed_digit
-            lda #.lobyte(elapsed_digit)
-            sta INBUFF
-            lda #.hibyte(elapsed_digit)
-            sta INBUFF + 1
-            jsr draw_string
-        .endscope
-    .endmacro
-
     draw_string_const str_space
-    countdown 36000, digits_space
-    countdown 3600, digits_zero
+    
+    .macro do_countdown divisor, digits
+        ldx #.lobyte(divisor) ; countdown takes the 16-bit divisor in X (low) / Y (high)
+        ldy #.hibyte(divisor)
+        lda #.lobyte(digits) ; INBUFF = address of the digit table to index
+        sta INBUFF
+        lda #.hibyte(digits)
+        sta INBUFF + 1
+        jsr countdown ; draws one digit and leaves the remainder in elapsed_work
+    .endmacro
+    do_countdown 36000, digits_space
+    do_countdown 3600, digits_zero
     draw_string_const str_h
-    countdown 600, digits_zero
-    countdown 60, digits_zero
+    do_countdown 600, digits_zero
+    do_countdown 60, digits_zero
     draw_string_const str_m
-    countdown 10, digits_zero
-    countdown 1, digits_zero
+    do_countdown 10, digits_zero
+    do_countdown 1, digits_zero
     draw_string_const str_s
 
 skipped:
@@ -2101,3 +2071,45 @@ loop:
     jmp main_loop
 
 .endproc
+
+; Print one digit of elapsed_work: subtract divisor until it no longer fits, then draw digits[count]
+; in: X = divisor low byte, Y = divisor high byte, INBUFF -> digit table indexed by the count
+; clobbers A, Y, temp, elapsed_digit, INBUFF; leaves the remainder in elapsed_work; calls draw_string
+.proc countdown
+    divisor = temp
+    stx divisor
+    sty divisor + 1
+
+    ; Y counts how many times divisor is subtracted (the digit value)
+    ldy #0
+countdown_loop:
+    lda elapsed_work + 1 ; unsigned 16-bit compare, high byte first
+    cmp divisor + 1
+    beq countdown_lobyte ; high bytes equal: the low bytes decide
+    bcc countdown_done ; elapsed_work < divisor: digit is complete
+    bcs countdown_inc ; always taken here (carry set and not equal)
+countdown_lobyte:
+    lda elapsed_work
+    cmp divisor
+    bcc countdown_done
+countdown_inc:
+    sec ; elapsed_work -= divisor (16-bit subtract)
+    lda elapsed_work
+    sbc divisor
+    sta elapsed_work
+    lda elapsed_work + 1
+    sbc divisor + 1
+    sta elapsed_work + 1
+    iny
+    jmp countdown_loop
+countdown_done:
+    lda (INBUFF),y ; look up the character for this digit
+    eor #$80 ; flip bit 7 -- presumably the end-of-string marker draw_string expects; TODO confirm
+    sta elapsed_digit
+    lda #.lobyte(elapsed_digit) ; repoint INBUFF at the one-character buffer
+    sta INBUFF
+    lda #.hibyte(elapsed_digit)
+    sta INBUFF + 1
+    jsr draw_string
+    rts
+.endproc

From 96e0356e578f7c94dd2c5764ecbf59f70dbe29d9 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 3 Jul 2025 18:41:24 -0700
Subject: [PATCH 100/104] WIP input handling for coords

experimental output via 32-bit mult, loses precision in conversion
---
 mandel.s | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 128 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index cad4e5e..4cc0f39 100644
--- a/mandel.s
+++ b/mandel.s
@@ -126,6 +126,10 @@ KEY_7     = 51
 KEY_8     = 53
 KEY_9     = 48
 KEY_0     = 50
+KEY_PERIOD = 34
+KEY_E     = 42
+KEY_X     = 22
+KEY_Y     = 43
 
 .struct float48
     exponent .byte
@@ -257,6 +261,28 @@ fixed3_13_as_float: ; float48
     .byte $00
     .byte $00
 
+u65536_as_float: ; float48
+    ; 1 << 16
+    ; 65536
+    ; 06 55 36 . 00 00
+    .byte 66 ; exponent/sign - +2 bytes
+    .byte $06
+    .byte $55
+    .byte $36
+    .byte $00
+    .byte $00
+
+fixed6_26_as_float: ; float48
+    ; 1 << 26
+    ; 67108864
+    ; 67 10 88 64 . 00
+    .byte 67 ; exponent/sign - +3 bytes
+    .byte $67
+    .byte $10
+    .byte $88
+    .byte $64
+    .byte $00
+
 sec_per_frame: ; float48 00 . 01 66 66 66 67
     .byte 63  ; exponent/sign - -1 bytes
     .byte $01 ; BCD digits
@@ -403,6 +429,13 @@ elapsed_work:
 elapsed_digit:
     .byte 0
 
+input_col:
+    .byte 0
+input_row:
+    .byte 0
+input_max:
+    .byte 0
+
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
     clc ; 2 cyc
@@ -983,6 +1016,66 @@ common:
 
 .endproc
 
+; input in FR0, 32 bits signed 6.26 fixed
+; output in FR0, Atari float
+; clobbers a, x, y, FR0, FR1
+.proc fixed6_26_to_float
+    ; check sign bit! conversion routine is for unsigned
+    lda FR0 + 3
+    and #$80
+    sta temp
+
+    beq positive
+    neg32 FR0
+positive:
+
+    ; save low word
+    lda FR0
+    pha
+    lda FR0 + 1
+    pha
+
+    ; convert high word
+    sta FR0 + 2
+    sta FR1
+    lda FR0 + 3
+    sta FR0 + 1
+    jsr IFP
+
+    lda temp
+    beq positive2
+    ; set float sign bit
+    lda FR0
+    ora #$80
+    sta FR0
+positive2:
+
+    ; high word to FR1
+    ldx #.lobyte(u65536_as_float)
+    ldy #.hibyte(u65536_as_float)
+    jsr FLD1R
+    jsr FMUL
+    jsr FMOVE
+
+    ; convert low word
+    pla
+    lda temp + 1
+    pla
+    lda temp
+    jsr IFP
+
+    ; combine
+    jsr FADD
+
+    ; scale
+    ldx #.lobyte(fixed6_26_as_float)
+    ldy #.hibyte(fixed6_26_as_float)
+    jsr FLD1R
+    jsr FDIV
+
+    rts
+.endproc
+
 ; input in FR0, Atari float
 ; output in FR0, 16 bits signed 3.13 fixed
 ; clobbers a, x, y, FR0, FR1
@@ -1603,7 +1696,7 @@ number_keys:
     beq five
     cpy #KEY_6
     beq six
-    jmp skip_char
+    jmp letter_keys
 
 one:
     ldx #0
@@ -1622,7 +1715,21 @@ five:
     jmp load_key_viewport
 six:
     ldx #5
-    ; fall through
+    jmp load_key_viewport
+
+letter_keys:
+    cpy #KEY_X
+    bne not_x
+    jsr input_x
+    jmp load_key_viewport
+not_x:
+    cpy #KEY_Y
+    bne not_y
+    jsr input_y
+    jmp load_key_viewport
+not_y:
+    jmp skip_char
+
 load_key_viewport:
     jsr load_viewport
     ; fall through
@@ -1632,6 +1739,23 @@ done:
 
 .endproc
 
+.proc input_x
+    ldx #col_x
+    ldy #1
+    jsr input_number
+
+
+    rts
+.endproc
+
+.proc input_y
+    rts
+.endproc
+
+.proc input_number
+    rts
+.endproc
+
 .proc clear_screen
     ; zero the range from framebuffer_top to display_list
     lda #.lobyte(framebuffer_top)
@@ -1679,9 +1803,7 @@ zero_byte_loop:
     draw_string_const str_x
 
     copy32 FR0, ox
-    shift_round_16 FR0, 3
-    copy16 FR0, FR0 + 2
-    jsr fixed3_13_to_float
+    jsr fixed6_26_to_float
     jsr FASC
     jsr draw_string
 
@@ -1690,9 +1812,7 @@ zero_byte_loop:
     draw_string_const str_y
 
     copy32 FR0, oy
-    shift_round_16 FR0, 3
-    copy16 FR0, FR0 + 2
-    jsr fixed3_13_to_float
+    jsr fixed6_26_to_float
     jsr FASC
     jsr draw_string
 

From f2a6af0995512835bae7e6ecd094e15ef5115fc3 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Thu, 3 Jul 2025 18:43:10 -0700
Subject: [PATCH 101/104] Replace the not-enough-precision 32 bit to float impl

keep the proc to encapsulate it, but use the older
logic of rounding down to 3.13 first
---
 mandel.s | 79 +++-----------------------------------------------------
 1 file changed, 4 insertions(+), 75 deletions(-)

diff --git a/mandel.s b/mandel.s
index 4cc0f39..a97d6e3 100644
--- a/mandel.s
+++ b/mandel.s
@@ -261,28 +261,6 @@ fixed3_13_as_float: ; float48
     .byte $00
     .byte $00
 
-u65536_as_float: ; float48
-    ; 1 << 16
-    ; 65536
-    ; 06 55 36 . 00 00
-    .byte 66 ; exponent/sign - +2 bytes
-    .byte $06
-    .byte $55
-    .byte $36
-    .byte $00
-    .byte $00
-
-fixed6_26_as_float: ; float48
-    ; 1 << 26
-    ; 67108864
-    ; 67 10 88 64 . 00
-    .byte 67 ; exponent/sign - +3 bytes
-    .byte $67
-    .byte $10
-    .byte $88
-    .byte $64
-    .byte $00
-
 sec_per_frame: ; float48 00 . 01 66 66 66 67
     .byte 63  ; exponent/sign - -1 bytes
     .byte $01 ; BCD digits
@@ -1016,63 +994,14 @@ common:
 
 .endproc
 
+; rounds to 16-bit first!
 ; input in FR0, 32 bits signed 6.26 fixed
 ; output in FR0, Atari float
 ; clobbers a, x, y, FR0, FR1
 .proc fixed6_26_to_float
-    ; check sign bit! conversion routine is for unsigned
-    lda FR0 + 3
-    and #$80
-    sta temp
-
-    beq positive
-    neg32 FR0
-positive:
-
-    ; save low word
-    lda FR0
-    pha
-    lda FR0 + 1
-    pha
-
-    ; convert high word
-    sta FR0 + 2
-    sta FR1
-    lda FR0 + 3
-    sta FR0 + 1
-    jsr IFP
-
-    lda temp
-    beq positive2
-    ; set float sign bit
-    lda FR0
-    ora #$80
-    sta FR0
-positive2:
-
-    ; high word to FR1
-    ldx #.lobyte(u65536_as_float)
-    ldy #.hibyte(u65536_as_float)
-    jsr FLD1R
-    jsr FMUL
-    jsr FMOVE
-
-    ; convert low word
-    pla
-    lda temp + 1
-    pla
-    lda temp
-    jsr IFP
-
-    ; combine
-    jsr FADD
-
-    ; scale
-    ldx #.lobyte(fixed6_26_as_float)
-    ldy #.hibyte(fixed6_26_as_float)
-    jsr FLD1R
-    jsr FDIV
-
+    shift_round_16 FR0, 3
+    copy16 FR0, FR0 + 2
+    jsr fixed3_13_to_float
     rts
 .endproc
 

From b46e6fb343e425985f53f868f1dbe4e3f6c0e384 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 1 Sep 2025 12:28:33 -0700
Subject: [PATCH 102/104] fix typo on stub x/y inputs

was accidentally falling through to the load
a viewport from a keypress thingy which was
not needed here
---
 mandel.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index a97d6e3..5edcbce 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1650,12 +1650,12 @@ letter_keys:
     cpy #KEY_X
     bne not_x
     jsr input_x
-    jmp load_key_viewport
+    jmp done
 not_x:
     cpy #KEY_Y
     bne not_y
     jsr input_y
-    jmp load_key_viewport
+    jmp done
 not_y:
     jmp skip_char
 

From 29cd3d968fbff768c23959fab2f6addcaff8ccfe Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 6 Sep 2025 19:53:25 -0700
Subject: [PATCH 103/104] Shaves 3 seconds off initial view runtime on XE :D

Instead of relying solely on the JMP thunks added to
imul16_func and sqr16_func, three call sites within the
mandelbrot iteration function are patched directly to
jsr to the XE versions, saving like 15 cycles per iter

Ok so it's not a lot, but every second counts. ;)

with XE code disabled:
1539 us/iter
5m13s

with old XE code:
1417 us/iter
4m48s

with new XE code:
1406 us/iter
4m45s
---
 mandel.s | 147 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 82 insertions(+), 65 deletions(-)

diff --git a/mandel.s b/mandel.s
index 5edcbce..b0f9c28 100644
--- a/mandel.s
+++ b/mandel.s
@@ -485,6 +485,7 @@ input_max:
 .endmacro
 
 ; 6 * bytes cycles
+; 4 * bytes bytes
 .macro copy bytes, dest, arg
     .repeat bytes, byte ; 6 * bytes cycles
         lda arg + byte  ; 3 cyc
@@ -493,6 +494,7 @@ input_max:
 .endmacro
 
 ; 12 cycles
+; 8 bytes
 .macro copy16 dest, arg
     copy 2, dest, arg
 .endmacro
@@ -538,6 +540,8 @@ input_max:
 
 ; input: arg1, arg2 as fixed4.12
 ; output: dest as fixed8.24
+; patch point jsr at 16 bytes in
+imul16_patch_offset = 16
 .macro imul16 dest, arg1, arg2
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
@@ -547,6 +551,8 @@ input_max:
 
 ; input: arg as fixed4.12
 ; output: dest as fixed8.24
+; patch point jsr at 8 bytes in
+sqr16_patch_offset = 8
 .macro sqr16 dest, arg
     copy16 FR0, arg   ; 12 cyc
     jsr sqr16_func    ; ? cyc
@@ -692,71 +698,6 @@ bank_switch_table:
     .endif
 .endmacro
 
-.proc imul8xe_init
-
-    bank_switch 0
-    lda #0
-    sta EXTENDED_RAM
-    bank_switch 1
-    lda #1
-    sta EXTENDED_RAM
-    bank_switch 0
-    lda EXTENDED_RAM
-    beq init
-
-    ; no bank switching available, we just overwrite the value in base ram
-    rts
-
-init:
-
-    ; patch imul16_func into a forwarding thunk to imul16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta imul16_func
-    lda #.lobyte(imul16xe_func)
-    sta imul16_func + 1
-    lda #.hibyte(imul16xe_func)
-    sta imul16_func + 2
-
-    ; ditto for sqr16_func -> sqr16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta sqr16_func
-    lda #.lobyte(sqr16xe_func)
-    sta sqr16_func + 1
-    lda #.hibyte(sqr16xe_func)
-    sta sqr16_func + 2
-
-    ; create the lookup table
-    ; go through the input set, in four 16KB chunks
-
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-
-    lda #$00
-    sta arg1
-    sta arg2
-    sta ptr
-    lda #$40
-    sta ptr + 1
-
-    ; $00 * $00 -> $3f * $ff
-    bank_switch 0
-    jsr imul8xe_init_section
-
-    ; $40 * $00 -> $7f * $ff
-    bank_switch 1
-    jsr imul8xe_init_section
-
-    ; $80 * $00 -> $bf * $ff
-    bank_switch 2
-    jsr imul8xe_init_section
-
-    ; $c0 * $00 -> $ff * $ff
-    bank_switch 3
-    jsr imul8xe_init_section
-
-    rts
-.endproc
 
 ; Initialize a 16 KB chunk of the table
 ; input: multipliers in temp
@@ -1156,12 +1097,15 @@ keep_going:
     shift_round_16 zy, 3
 
     ; zx_2 = zx * zx
+fixup_sqr16_1:
     sqr16 zx_2, zx + 2
 
     ; zy_2 = zy * zy
+fixup_sqr16_2:
     sqr16 zy_2, zy + 2
 
     ; zx_zy = zx * zy
+fixup_imul16_1:
     imul16 zx_zy, zx + 2, zy + 2
 
     ; dist = zx_2 + zy_2
@@ -2162,3 +2106,76 @@ countdown_done:
     jsr draw_string
     rts
 .endproc
+
+.proc imul8xe_init
+    ; Probe for bank-switched RAM; if present, redirect multiplies to the XE routines and build their table.
+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0
+    lda EXTENDED_RAM
+    beq init ; bank 0 still reads 0, so banks 0 and 1 are distinct memory
+
+    ; readback was nonzero: the bank 1 write landed on the same byte, so no bank switching; leave code unpatched
+    rts
+
+init:
+    ; Patch 1: redirect 16-bit multiplies to the XE implementation.
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1 ; also retarget the jsr operand at the call site
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2
+
+    ; ditto for sqr16_func -> sqr16xe_func
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1 ; both sqr16 call sites get the direct address
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2
+
+    ; Patch 2: fill the multiplication lookup table, one bank per section.
+    ; create the lookup table
+    ; go through the input set, in four 16KB chunks
+
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+
+    lda #$00 ; both multipliers and the low byte of ptr start at 0
+    sta arg1
+    sta arg2
+    sta ptr
+    lda #$40 ; ptr = $4000 -- presumably the start of the banked window; TODO confirm
+    sta ptr + 1
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc

From 6479cf530c1c584f33b96f2b19885d02415863bb Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 16 Sep 2025 21:29:40 -0700
Subject: [PATCH 104/104] update some timings

---
 mandel.s | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mandel.s b/mandel.s
index b0f9c28..b52f24a 100644
--- a/mandel.s
+++ b/mandel.s
@@ -461,7 +461,7 @@ input_max:
     sub 4, dest, arg1, arg2
 .endmacro
 
-; 3 + 5 * bytes cycles
+; 3 + 5 * (bytes - 1) cycles
 .macro shl bytes, arg
     asl arg              ; 3 cyc
     .repeat bytes-1, i
@@ -469,17 +469,17 @@ input_max:
     .endrepeat
 .endmacro
 
-; 13 cycles
+; 8 cycles
 .macro shl16 arg
     shl 2, arg
 .endmacro
 
-; 18 cycles
+; 13 cycles
 .macro shl24 arg
     shl 3, arg
 .endmacro
 
-; 23 cycles
+; 18 cycles
 .macro shl32 arg
     shl 4, arg
 .endmacro
@@ -529,11 +529,11 @@ input_max:
     neg 4, arg
 .endmacro
 
-; 11-27 + 23 * shift cycles
-; 103-119 cycles for shift=4
+; 11-27 + 18 * shift cycles
+; 65-81 cycles for shift=3
 .macro shift_round_16 arg, shift
     .repeat shift
-        shl32 arg ; 23 cycles
+        shl32 arg ; 18 cycles
     .endrepeat
     round16 arg ; 11-27 cycles
 .endmacro