From 0d086a179cf8e91b839f306bb597ef9e6125f6b2 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 07:20:53 -0800
Subject: [PATCH 01/10] wip

---
 mandel.s | 108 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 68 insertions(+), 40 deletions(-)

diff --git a/mandel.s b/mandel.s
index fc30532..50213ad 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,43 +1,42 @@
 ; Our zero-page vars
-sx    = $80     ; i16: screen pixel x
-sy    = $82     ; i16: screen pixel y
-ox    = $84     ; fixed4.12: center point x
-oy    = $86     ; fixed4.12: center point y
-cx    = $88     ; fixed4.12: c_x
-cy    = $8a     ; fixed4.12: c_y
-zx    = $8c     ; fixed4.12: z_x
-zy    = $8e     ; fixed4.12: z_y
+ox              = $80 ; fixed8.24: center point x
+oy              = $84 ; fixed8.24: center point y
+cx              = $88 ; fixed8.24: c_x
+cy              = $8c ; fixed8.24: c_y
 
-zx_2  = $90     ; fixed4.12: z_x^2
-zy_2  = $92     ; fixed4.12: z_y^2
-zx_zy = $94     ; fixed4.12: z_x * z_y
-dist  = $96     ; fixed4.12: z_x^2 + z_y^2
+zx              = $90 ; fixed8.24: z_x
+zy              = $94 ; fixed8.24: z_y
+zx_2            = $98 ; fixed8.24: z_x^2
+zy_2            = $9c ; fixed8.24: z_y^2
 
-iter          = $a0 ; u8: iteration count
+zx_zy           = $a0 ; fixed8.24: z_x * z_y
+dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
+sx              = $a8 ; i16: screen pixel x
+sy              = $aa ; i16: screen pixel y
+z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
+z_buffer_start  = $ad ; u8: index into z_buffer
+z_buffer_end    = $ae ; u8: index into z_buffer
+iter            = $af ; u8: iteration count
 
-zoom          = $a1 ; u8: zoom shift level
-count_frames  = $a2 ; u8
-count_pixels  = $a3 ; u8
-total_ms      = $a4 ; float48
-total_pixels  = $aa ; float48
+ptr             = $b0 ; u16
+pixel_ptr       = $b2 ; u16
+zoom            = $b4 ; u8: zoom shift level
+fill_level      = $b5 ; u8
+pixel_color     = $b6 ; u8
+pixel_mask      = $b7 ; u8
+pixel_shift     = $b8 ; u8
+pixel_offset    = $b9 ; u8
+palette_offset  = $ba ; u8
+chroma_offset   = $bb ; u8
+palette_ticks   = $bc ; u8
+chroma_ticks    = $bd ; u8
+count_frames    = $be ; u8
+count_pixels    = $bf ; u8
 
-z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
-z_buffer_start  = $b1 ; u8: index into z_buffer
-z_buffer_end    = $b2 ; u8: index into z_buffer
-temp            = $b4 ; u16
-temp2           = $b6 ; u16
-pixel_ptr       = $b8 ; u16
-pixel_color     = $ba ; u8
-pixel_mask      = $bb ; u8
-pixel_shift     = $bc ; u8
-pixel_offset    = $bd ; u8
-fill_level      = $be ; u8
-palette_offset  = $bf ; u8
-
-palette_ticks = $c0 ; u8
-chroma_ticks  = $c1 ; u8
-chroma_offset = $c2 ; u8
-ptr           = $c4 ; u16
+total_pixels    = $c0 ; float48
+total_ms        = $c6 ; float48
+temp            = $cc ; u16
+temp2           = $ce ; u16
 
 palette_delay = 23
 chroma_delay = 137
@@ -884,12 +883,41 @@ next:
     ; zx_zy = 0
     ; dist = 0
     ; iter = 0
+;    lda #00
+;    ldx #(iter - zx + 1)
+;initloop:
+;    sta zx - 1,x
+;    dex
+;    bne initloop
+;    sta z_buffer_start
+;    sta z_buffer_end
+
     lda #00
-    ldx #(iter - zx + 1)
-initloop:
-    sta zx - 1,x
-    dex
-    bne initloop
+    sta zx
+    sta zx + 1
+    sta zx + 2
+    sta zx + 3
+    sta zy
+    sta zy + 1
+    sta zy + 2
+    sta zy + 3
+    sta zx_2
+    sta zx_2 + 1
+    sta zx_2 + 2
+    sta zx_2 + 3
+    sta zy_2
+    sta zy_2 + 1
+    sta zy_2 + 2
+    sta zy_2 + 3
+    sta zx_zy
+    sta zx_zy + 1
+    sta zx_zy + 2
+    sta zx_zy + 3
+    sta dist
+    sta dist + 1
+    sta dist + 2
+    sta dist + 3
+    sta iter
     sta z_buffer_start
     sta z_buffer_end
 

From 4a1e35699adcce1af0f60ea51573e8a215975c66 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 07:19:45 -0800
Subject: [PATCH 02/10] wip

---
 mandel.s | 71 ++++++++++++++++++++++++++++++++++++++------------------
 todo.md  |  2 +-
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/mandel.s b/mandel.s
index 50213ad..622ff62 100644
--- a/mandel.s
+++ b/mandel.s
@@ -433,6 +433,13 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro imul16 dest, arg1, arg2
+    copy16 FR0, arg1  ; 12 cyc
+    copy16 FR1, arg2  ; 12 cyc
+    jsr imul16_func   ; ? cyc
+    copy32 dest, FR2  ; 24 cyc
+.endmacro
+
 .macro sqr16_round dest, arg, shift
     ;imul16_round dest, arg, arg, shift
     copy16 FR0, arg   ; 12 cyc
@@ -441,6 +448,12 @@ viewport_oy:
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
+.macro sqr16 dest, arg
+    copy16 FR0, arg   ; 12 cyc
+    jsr sqr16_func    ; ? cyc
+    copy32 dest, FR2  ; 24 cyc
+.endmacro
+
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg
@@ -870,8 +883,8 @@ next:
 
 .proc mandelbrot
     ; input:
-    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; cy: position scaled to 4.12
+    ; cx: position scaled to 8.24 fixed point - -128..+127.9
+    ; cy: position scaled to 8.24
     ;
     ; output:
     ; iter: iteration count at escape or 0
@@ -909,10 +922,6 @@ next:
     sta zy_2 + 1
     sta zy_2 + 2
     sta zy_2 + 3
-    sta zx_zy
-    sta zx_zy + 1
-    sta zx_zy + 2
-    sta zx_zy + 3
     sta dist
     sta dist + 1
     sta dist + 2
@@ -929,6 +938,8 @@ loop:
 keep_going:
 
     .macro quick_exit arg, max
+        ; arg: fixed8.24 value to range-check (read only)
+        ; max: integer bound; jumps to exit_path when arg >= max or arg <= -max
         .local positive
         .local negative
         .local nope_out
@@ -936,51 +947,61 @@ keep_going:
         .local all_done
 
         ; check sign bit
-        lda arg + 1
+        lda arg + 3
         bmi negative
 
     positive:
-        cmp #((max) << 4)
+        cmp #max
         bmi all_done ; 'less than'
         jmp exit_path
 
     negative:
-        cmp #(256 - ((max) << 4))
+        cmp #(256 - max)
         beq first_equal ; 'equal' on first byte
         bpl all_done    ; 'greater than'
 
     nope_out:
         jmp exit_path
-    
+
     first_equal:
+        ; following bytes all 0 shows it's really 'equal'
+        lda arg + 2
+        bne all_done
+        lda arg + 1
+        bne all_done
         lda arg
-        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
+        bne all_done
+        jmp exit_path
 
     all_done:
     .endmacro
 
-    ; 4.12: (-8 .. +7.9)
+    ; 8.24: (-128 .. 127.9) / (-8 .. +7.9)
     ; zx = zx_2  - zy_2  + cx
-    sub16 zx, zx_2, zy_2
-    add16 zx, zx, cx
+    sub32 zx, zx_2, zy_2
+    add32 zx, zx, cx
     quick_exit zx, 2
 
     ; zy = zx_zy + zx_zy + cy
-    add16 zy, zx_zy, zx_zy
-    add16 zy, zy, cy
+    add32 zy, zx_zy, zx_zy
+    add32 zy, zy, cy
     quick_exit zy, 2
 
+    ; convert 8.24 -> 4.12
+    shift_round_16 zx, 4
+    shift_round_16 zy, 4
+
     ; zx_2 = zx * zx
-    sqr16_round zx_2, zx, 4
+    sqr16 zx_2, zx + 2
 
     ; zy_2 = zy * zy
-    sqr16_round zy_2, zy, 4
+    sqr16 zy_2, zy + 2
 
     ; zx_zy = zx * zy
-    imul16_round zx_zy, zx, zy, 4
+    imul16 zx_zy, zx + 2, zy + 2
 
     ; dist = zx_2 + zy_2
-    add16 dist, zx_2, zy_2
+    add32 dist, zx_2, zy_2
     quick_exit dist, 4
 
     ; if may be in the lake, look for looping output with a small buffer
@@ -1090,13 +1111,17 @@ enough:
 .endmacro
 
 .macro zoom_factor dest, src, zoom, aspect
+    ; output: dest: fixed8.24
+    ; input: src: fixed4.12
+    ; input: zoom: u8 ???
+    ; aspect: fixed4.12
     ; clobbers A, X, flags, etc
     copy16 dest, src
     scale_zoom dest
 
     ; cy = cy * (3 / 4)
     ; cx = cx * (5 / 4)
-    imul16_round dest, dest, aspect, 4
+    imul16 dest, dest, aspect
 .endmacro
 
 .proc pset
@@ -1567,9 +1592,9 @@ not_skipped_mask:
 
     ; run the fractal!
     zoom_factor cx, sx, zoom, aspect_x
-    add16 cx, cx, ox
+    add32 cx, cx, ox
     zoom_factor cy, sy, zoom, aspect_y
-    add16 cy, cy, oy
+    add32 cy, cy, oy
     jsr mandelbrot
     jsr pset
 
diff --git a/todo.md b/todo.md
index 6fb0282..29217cd 100644
--- a/todo.md
+++ b/todo.md
@@ -3,7 +3,7 @@ things to try:
 * skip add on the top-byte multiply in sqr8/mul8
   * should save a few cycles, suggestion by jamey
 
-* perform the zx += zx^s + cx in 32-bit space, before rounding
+* perform the zx_next = zx^s + cx in 32-bit space, before rounding
   * should improve precision on max zoom, might cost a few cycles
 
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D

From 7184b8e03f2748efd532277995afe5fa7d4a3cf6 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 07:33:20 -0800
Subject: [PATCH 03/10] wip

---
 mandel.s | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/mandel.s b/mandel.s
index 622ff62..86a6b48 100644
--- a/mandel.s
+++ b/mandel.s
@@ -292,16 +292,16 @@ viewport_zoom:
     .byte 6
 
 viewport_ox:
-    .word $0000
-    .word $f110
-    .word $f110
-    .word $e400
+    .dword $00000000
+    .dword $ff110000
+    .dword $ff110000
+    .dword $fe400000
 
 viewport_oy:
-    .word $0000
-    .word $fb60
-    .word $fbe0
-    .word $0000
+    .dword $00000000
+    .dword $ffb60000
+    .dword $ffbe0000
+    .dword $00000000
 
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@@ -1459,17 +1459,32 @@ zero_byte_loop:
 
     txa
     asl a
+    asl a
+
     tax
     lda viewport_ox,x
     sta ox
     lda viewport_oy,x
     sta oy
+
     inx
     lda viewport_ox,x
     sta ox + 1
     lda viewport_oy,x
     sta oy + 1
 
+    inx
+    lda viewport_ox,x
+    sta ox + 2
+    lda viewport_oy,x
+    sta oy + 2
+
+    inx
+    lda viewport_ox,x
+    sta ox + 3
+    lda viewport_oy,x
+    sta oy + 3
+
     rts
 .endproc
 

From 13257309dc3a6493e05575404f5deddd09e9192d Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 08:34:02 -0800
Subject: [PATCH 04/10] init fix

---
 mandel.s | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mandel.s b/mandel.s
index 86a6b48..76816c2 100644
--- a/mandel.s
+++ b/mandel.s
@@ -922,6 +922,10 @@ next:
     sta zy_2 + 1
     sta zy_2 + 2
     sta zy_2 + 3
+    sta zx_zy
+    sta zx_zy + 1
+    sta zx_zy + 2
+    sta zx_zy + 3
     sta dist
     sta dist + 1
     sta dist + 2

From 2fcb30b76a66819ab96ec3353b8ce4978f723675 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 08:56:59 -0800
Subject: [PATCH 05/10] wip

---
 mandel.s | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mandel.s b/mandel.s
index 76816c2..0400003 100644
--- a/mandel.s
+++ b/mandel.s
@@ -980,7 +980,7 @@ keep_going:
     all_done:
     .endmacro
 
-    ; 8.24: (-128 .. 127.9) / (-8 .. +7.9)
+    ; 8.24: (-128 .. 127.9)
     ; zx = zx_2  - zy_2  + cx
     sub32 zx, zx_2, zy_2
     add32 zx, zx, cx
@@ -991,7 +991,7 @@ keep_going:
     add32 zy, zy, cy
     quick_exit zy, 2
 
-    ; convert 8.24 -> 4.12
+    ; convert 8.24 -> 4.12: (-8 .. +7.9)
     shift_round_16 zx, 4
     shift_round_16 zy, 4
 
@@ -1042,10 +1042,10 @@ z_buffer_loop:
 
     ; Compare the previously stored z values
     ldy #0
-    z_compare zx
-    z_compare zx + 1
-    z_compare zy
-    z_compare zy + 1
+    z_compare zx + 2
+    z_compare zx + 3
+    z_compare zy + 2
+    z_compare zy + 3
 
     cpy #4
     bne z_no_matches
@@ -1060,10 +1060,10 @@ z_no_matches:
 z_nothing_to_read:
 
     ; Store and expand
-    z_store zx
-    z_store zx + 1
-    z_store zy
-    z_store zy + 1
+    z_store zx + 2
+    z_store zx + 3
+    z_store zy + 2
+    z_store zy + 3
     z_advance
     stx z_buffer_end
 

From d2f41f964435b3803ce694a70bf38687fd467caa Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 09:02:42 -0800
Subject: [PATCH 06/10] wip

---
 mandel.s | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/mandel.s b/mandel.s
index 0400003..8b63941 100644
--- a/mandel.s
+++ b/mandel.s
@@ -425,14 +425,8 @@ viewport_oy:
     round16 arg ; 11-27 cycles
 .endmacro
 
-.macro imul16_round dest, arg1, arg2, shift
-    copy16 FR0, arg1  ; 12 cyc
-    copy16 FR1, arg2  ; 12 cyc
-    jsr imul16_func   ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
-    copy16 dest, FR2 + 2  ; 12 cyc
-.endmacro
-
+; input: arg1, arg2 as fixed4.12
+; output: dest as fixed8.24
 .macro imul16 dest, arg1, arg2
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
@@ -440,20 +434,16 @@ viewport_oy:
     copy32 dest, FR2  ; 24 cyc
 .endmacro
 
-.macro sqr16_round dest, arg, shift
-    ;imul16_round dest, arg, arg, shift
-    copy16 FR0, arg   ; 12 cyc
-    jsr sqr16_func      ; ? cyc
-    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
-    copy16 dest, FR2 + 2  ; 12 cyc
-.endmacro
-
+; input: arg as fixed4.12
+; output: dest as fixed8.24
 .macro sqr16 dest, arg
     copy16 FR0, arg   ; 12 cyc
     jsr sqr16_func    ; ? cyc
     copy32 dest, FR2  ; 24 cyc
 .endmacro
 
+; input: arg as u8
+; output: dest as u16
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg

From 1e0f577e099b3d7787d6e6d4fce1813ccd6b489c Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 09:09:11 -0800
Subject: [PATCH 07/10] wip

---
 mandel.s | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mandel.s b/mandel.s
index 8b63941..6977582 100644
--- a/mandel.s
+++ b/mandel.s
@@ -453,6 +453,8 @@ viewport_oy:
     sta dest + 1
 .endmacro
 
+; input: arg as u8
+; input/output: dest as u16
 ; clobbers a, x
 .macro sqr8_add16 dest, arg
     ldx arg

From 81bf7f3c434646f0374c35f20131050bd314d1b2 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 09:53:22 -0800
Subject: [PATCH 08/10] tweak

---
 mandel.s | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mandel.s b/mandel.s
index 6977582..4ab6c19 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1106,10 +1106,9 @@ cont:
 enough:
 .endmacro
 
-.macro zoom_factor dest, src, zoom, aspect
+.macro zoom_factor dest, src, aspect
     ; output: dest: fixed8.24
     ; input: src: fixed4.12
-    ; input: zoom: u8 ???
     ; aspect: fixed4.12
     ; clobbers A, X, flags, etc
     copy16 dest, src
@@ -1602,9 +1601,9 @@ skipped_mask:
 not_skipped_mask:
 
     ; run the fractal!
-    zoom_factor cx, sx, zoom, aspect_x
+    zoom_factor cx, sx, aspect_x
     add32 cx, cx, ox
-    zoom_factor cy, sy, zoom, aspect_y
+    zoom_factor cy, sy, aspect_y
     add32 cy, cy, oy
     jsr mandelbrot
     jsr pset

From 2e8893fd7892429bc07bd1d653ef1319be7d2d7b Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 13:54:53 -0800
Subject: [PATCH 09/10] fix add32 macro ignoring arg1

---
 mandel.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mandel.s b/mandel.s
index 4ab6c19..04edec5 100644
--- a/mandel.s
+++ b/mandel.s
@@ -320,7 +320,7 @@ viewport_oy:
 
 ; 38 cycles
 .macro add32 dest, arg1, arg2
-    add 4, dest, arg2, dest
+    add 4, dest, arg1, arg2
 .endmacro
 
 ; 8 cycles

From cc83c76706519cce3fff61ce46df9589d31025d6 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Tue, 31 Dec 2024 14:16:43 -0800
Subject: [PATCH 10/10] update docs for 32-bit intermediates

---
 readme.md | 4 ++--
 todo.md   | 4 +---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/readme.md b/readme.md
index f297d60..d60644c 100644
--- a/readme.md
+++ b/readme.md
@@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
 
-The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
 
 Iterations are capped at 255.
 
@@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e
 
 ## Todo
 
-See ideas in `todo.md`.
\ No newline at end of file
+See ideas in `todo.md`.
diff --git a/todo.md b/todo.md
index 29217cd..284d653 100644
--- a/todo.md
+++ b/todo.md
@@ -3,13 +3,11 @@ things to try:
 * skip add on the top-byte multiply in sqr8/mul8
   * should save a few cycles, suggestion by jamey
 
-* perform the zx_next = zx^s + cx in 32-bit space, before rounding
-  * should improve precision on max zoom, might cost a few cycles
-
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
 * try 3.13 fixed point instead of 4.12 for more precision
   * can we get away without the extra bit?
+  * probably yes, since the exit comparison would then happen in 6.26 space
 
 * y-axis mirror optimization