From dbbec8ed6d648ab2984a8fb0513d14f60b30ca80 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 22 Jan 2023 09:34:42 -0800
Subject: [PATCH 1/5] ok two things wrong:

1) bit masks are backwards
2) iter always returning 0
---
 mandel.s | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mandel.s b/mandel.s
index be1f59e..9d3a2a6 100644
--- a/mandel.s
+++ b/mandel.s
@@ -438,9 +438,9 @@ next:
     ; dist = 0
     ; iter = 0
     lda #00
-    ldx iter - zx
+    ldx #(iter - zx + 1)
 initloop:
-    sta zx,x
+    sta zx - 1,x
     dex
     bne initloop
 
@@ -705,6 +705,7 @@ loop_sx:
     zoom_factor cx, sx, zoom, aspect_x
     zoom_factor cy, sy, zoom, aspect_y
     jsr mandelbrot
+    inc iter
     jsr pset
 
     clc

From b4721ae46b0399deeffed8cf2eca6f3eb9feed5f Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 22 Jan 2023 09:37:37 -0800
Subject: [PATCH 2/5] fix pixel shift

---
 mandel.s | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mandel.s b/mandel.s
index 9d3a2a6..df80ed7 100644
--- a/mandel.s
+++ b/mandel.s
@@ -584,6 +584,9 @@ point:
     ; pixel_mask <<= pixel_shift (shifting in ones)
     and #3
     sta pixel_shift
+    lda #3
+    sec
+    sbc pixel_shift
     tax
 shift_loop:
     beq shift_done

From ae9dd0674d002e726566a20e7756c3619244576d Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 22 Jan 2023 10:42:37 -0800
Subject: [PATCH 3/5] corrupt! but it produces pixels

---
 mandel.s | 64 ++++++++++++++++++++++++--------------------------------
 1 file changed, 27 insertions(+), 37 deletions(-)

diff --git a/mandel.s b/mandel.s
index df80ed7..b4baad5 100644
--- a/mandel.s
+++ b/mandel.s
@@ -168,7 +168,7 @@ color_map:
 .endmacro
 
 .macro add32 dest, arg1, arg2
-    add 2, dest, arg2, dest
+    add 4, dest, arg2, dest
 .endmacro
 
 ; 2 + 9 * byte cycles
@@ -422,6 +422,13 @@ next:
 
 .endmacro
 
+.macro shift_round_16 arg, shift
+    .repeat shift
+        shl32 arg
+    .endrepeat
+    round16 arg
+.endmacro
+
 .proc mandelbrot
     ; input:
     ; cx: position scaled to 4.12 fixed point - -8..+7.9
@@ -445,64 +452,48 @@ initloop:
     bne initloop
 
 loop:
-    ; 1939 - 3007 cyc
-
-    ; iter++ & max-iters break = 7 cyc
-    inc iter       ; 5 cyc
-    bne keep_going ; 2 cyc
+    ; iter++ & max-iters break
+    inc iter
+    bne keep_going
     rts
 keep_going:
 
     ; 4.12: (-8 .. +7.9)
-    ; zx = zx_2  - zy_2  + cx   = 3 * 20 = 60 cyc
-    sub16 zx, zx_2, zy_2
+    ; zx = zx_2  - zy_2  + cx
+    sub16 zx, zx_2 + 2, zy_2 + 2
     add16 zx, zx, cx
 
-    ; zy = zx_zy + zx_zy + cy   = 3 * 20 = 60 cyc
-    sub16 zy, zx_zy, zx_zy
+    ; zy = zx_zy + zx_zy + cy
+    add16 zy, zx_zy + 2, zx_zy + 2
     add16 zy, zy, cy
 
     ; 8.24: (-128 .. +127.9)
-    ; zx_2 = zx * zx            = 518 - 828 cyc
+    ; zx_2 = zx * zx
     imul16 zx_2, zx, zx
 
-    ; zy_2 = zy * zy            = 518 - 828 cyc
+    ; zy_2 = zy * zy
     imul16 zy_2, zy, zy
 
-    ; zx_zy = zx * zy           = 518 - 828 cyc
+    ; zx_zy = zx * zy
     imul16 zx_zy, zx, zy
 
-    ; dist = zx_2 + zy_2        = 38 cyc
+    ; dist = zx_2 + zy_2
     add32 dist, zx_2, zy_2
 
-    ; if dist >= 4 break, else continue iterating = 7 cyc
-    lda dist + 3  ; 3 cyc
-    cmp #4        ; 2 cyc
-    bmi still_in  ; 2 cyc
+    ; if dist >= 4 break, else continue iterating
+    lda dist + 3
+    cmp #4
+    bmi still_in
     rts
 still_in:
 
-    ; shift and round zx_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
-    .repeat 4      ; 60 cyc
-        shl24 zx_2 ; 15 cyc
-    .endrepeat
-    round16 zx_2   ; 5-28 cycles
-
-    ; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 65 - 88 cyc
-    .repeat 4      ; 60 cyc
-        shl24 zy_2 ; 15 cyc
-    .endrepeat
-    round16 zy_2   ; 5-28 cycles
-
-    ; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 65 - 88 cyc
-    .repeat 4       ; 60 cyc
-        shl24 zx_zy ; 15 cyc
-    .endrepeat
-    round16 zx_zy   ; 5-28 cycles
+    shift_round_16 zx_2, 4
+    shift_round_16 zy_2, 4
+    shift_round_16 zx_zy, 4
 
     ; if may be in the lake, look for looping output with a small buffer
     ; as an optimization vs running to max iters
-    jmp loop ; 3 cycles
+    jmp loop
 
 .endproc
 
@@ -708,7 +699,6 @@ loop_sx:
     zoom_factor cx, sx, zoom, aspect_x
     zoom_factor cy, sy, zoom, aspect_y
     jsr mandelbrot
-    inc iter
     jsr pset
 
     clc

From 1bef004ccd08d5716f181417870ea837d7d96cb6 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 22 Jan 2023 11:17:51 -0800
Subject: [PATCH 4/5] precision cleanup

using 4.12 and 8.24 consistently
---
 mandel.s | 68 ++++++++++++++++++--------------------------------------
 1 file changed, 22 insertions(+), 46 deletions(-)

diff --git a/mandel.s b/mandel.s
index b4baad5..10782b8 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,17 +1,17 @@
 ; Our zero-page vars
 sx    = $80     ; i16: screen pixel x
 sy    = $82     ; i16: screen pixel y
-ox    = $84     ; fixed3.13: center point x
-oy    = $86     ; fixed3.13: center point y
-cx    = $84     ; fixed3.13: c_x
-cy    = $86     ; fixed3.13: c_y
-zx    = $88     ; fixed3.13: z_x
-zy    = $8a     ; fixed3.13: z_y
+ox    = $84     ; fixed4.12: center point x
+oy    = $86     ; fixed4.12: center point y
+cx    = $84     ; fixed4.12: c_x
+cy    = $86     ; fixed4.12: c_y
+zx    = $88     ; fixed4.12: z_x
+zy    = $8a     ; fixed4.12: z_y
 
-zx_2  = $90     ; fixed6.26: z_x^2
-zy_2  = $94     ; fixed6.26: z_y^2
-zx_zy = $98     ; fixed6.26: z_x * z_y
-dist  = $9c     ; fixed6.26: z_x^2 + z_y^2
+zx_2  = $90     ; fixed8.24: z_x^2
+zy_2  = $94     ; fixed8.24: z_y^2
+zx_zy = $98     ; fixed8.24: z_x * z_y
+dist  = $9c     ; fixed8.24: z_x^2 + z_y^2
 
 iter  = $a0     ; u8: iteration count
 zoom  = $a1     ; u8: zoom shift level
@@ -42,8 +42,6 @@ half_height = height >> 1
 width = 160
 half_width = width >> 1
 stride = width >> 2
-width_ratio_3_13 = (5 << 11) ; 5/4
-height_ratio_3_13 = (3 << 11) ; 5/4
 
 DMACTL = $D400
 DLISTL = $D402
@@ -101,18 +99,12 @@ aspect:
     ; 184h is the equiv of 220.8h at square pixels
     ; 320 / 220.8 = 1.45 display aspect ratio
 aspect_x:
-    .word 5 << (13 - 2)
+    .word 5 << (12 - 2)
 
 aspect_y:
-    .word 3 << (13 - 2)
+    .word 3 << (12 - 2)
 
 
-bit_masks:
-    .byte 3
-    .byte 3 << 2
-    .byte 3 << 4
-    .byte 3 << 6
-
 display_list_start:
     ; 24 lines overscan
     .repeat 3
@@ -244,21 +236,6 @@ color_map:
     neg 4, arg
 .endmacro
 
-.macro extend_8_16 dest, src
-    ; clobbers A, X
-    ; 13-15 cycles
-    .local positive
-    .local negative
-    ldx #0       ; 2 cyc
-    lda src      ; 3 cyc
-    sta dest     ; 3 cyc
-    bpl positive ; 2 cyc
-negative:
-    dex          ; 2 cyc
-positive:
-    stx dest + 1 ; 3 cyc
-.endmacro
-
 ; inner loop for imul16
 ; bitnum < 8: 25 or 41 cycles
 ; bitnum >= 8: 30 or 46 cycles
@@ -307,7 +284,6 @@ next:
         ror result ; 5 cyc
     .endif
 
-
 .endmacro
 
 ; 5 to 25 cycles
@@ -330,11 +306,18 @@ positive:
     copy32 dest, FR2  ; 24 cyc
 .endmacro
 
-.macro imul16_round dest, arg1, arg2
+.macro shift_round_16 arg, shift
+    .repeat shift
+        shl32 arg
+    .endrepeat
+    round16 arg
+.endmacro
+
+.macro imul16_round dest, arg1, arg2, shift
     copy16 FR0, arg1  ; 12 cyc
     copy16 FR1, arg2  ; 12 cyc
     jsr imul16_func   ; 470-780 cyc
-    round16 FR2       ; 5-28 cyc
+    shift_round_16 FR2, shift
     copy16 dest, FR2 + 2  ; 12 cyc
 .endmacro
 
@@ -422,13 +405,6 @@ next:
 
 .endmacro
 
-.macro shift_round_16 arg, shift
-    .repeat shift
-        shl32 arg
-    .endrepeat
-    round16 arg
-.endmacro
-
 .proc mandelbrot
     ; input:
     ; cx: position scaled to 4.12 fixed point - -8..+7.9
@@ -514,7 +490,7 @@ enough:
 
     ; cy = cy * (3 / 4)
     ; cx = cx * (5 / 4)
-    imul16_round dest, dest, aspect
+    imul16_round dest, dest, aspect, 4
 .endmacro
 
 .proc pset

From 57975b7158c8dae6ee2d10519a4bca36f3b6dac1 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@pobox.com>
Date: Sun, 22 Jan 2023 12:02:15 -0800
Subject: [PATCH 5/5] not sure what's wrong, have to check over

---
 mandel.s | 49 +++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/mandel.s b/mandel.s
index 10782b8..f4862c6 100644
--- a/mandel.s
+++ b/mandel.s
@@ -254,10 +254,10 @@ color_map:
     ; 5 cycles either way
     .if bitnum < 8
         lda arg1                 ; 3 cyc
-        and #(1 << bitnum)       ; 2 cyc
+        and #(1 << (bitnum))       ; 2 cyc
     .else
         lda arg1 + 1             ; 3 cyc
-        and #(1 << (bitnum - 8)) ; 2 cyc
+        and #(1 << ((bitnum) - 8)) ; 2 cyc
     .endif
     bne one ; 2 cyc
 
@@ -434,43 +434,48 @@ loop:
     rts
 keep_going:
 
+    .macro quick_exit arg
+        .local keep_going
+        lda arg + 1
+        cmp #(4 << 4)
+        bmi keep_going
+        rts
+    keep_going:
+    .endmacro
+
     ; 4.12: (-8 .. +7.9)
     ; zx = zx_2  - zy_2  + cx
-    sub16 zx, zx_2 + 2, zy_2 + 2
+    sub16 zx, zx_2, zy_2
     add16 zx, zx, cx
+    quick_exit zx
 
     ; zy = zx_zy + zx_zy + cy
-    add16 zy, zx_zy + 2, zx_zy + 2
+    add16 zy, zx_zy, zx_zy
     add16 zy, zy, cy
 
-    ; 8.24: (-128 .. +127.9)
     ; zx_2 = zx * zx
-    imul16 zx_2, zx, zx
+    imul16_round zx_2, zx, zx, 4
+    quick_exit dist
 
     ; zy_2 = zy * zy
-    imul16 zy_2, zy, zy
+    imul16_round zy_2, zy, zy, 4
+    quick_exit dist
 
     ; zx_zy = zx * zy
-    imul16 zx_zy, zx, zy
+    imul16_round zx_zy, zx, zy, 4
+    quick_exit dist
 
     ; dist = zx_2 + zy_2
-    add32 dist, zx_2, zy_2
-
-    ; if dist >= 4 break, else continue iterating
-    lda dist + 3
-    cmp #4
-    bmi still_in
-    rts
-still_in:
-
-    shift_round_16 zx_2, 4
-    shift_round_16 zy_2, 4
-    shift_round_16 zx_zy, 4
+    add16 dist, zx_2, zy_2
+    quick_exit dist
 
     ; if may be in the lake, look for looping output with a small buffer
     ; as an optimization vs running to max iters
     jmp loop
 
+peace_out:
+    rts
+
 .endproc
 
 .macro zoom_factor dest, src, zoom, aspect
@@ -607,9 +612,13 @@ done:
     sta ox + 1
     sta oy
     sta oy + 1
+
+    ; zoom = 2x
+    lda #1
     sta zoom
 
     ; Disable display DMA
+    lda #0
     sta DMACTL
 
     ; zero the range from framebuffer_top to framebuffer_end