3 changed files with 96 additions and 157 deletions
--- a/mandel.s
+++ b/mandel.s
@ -1,42 +1,43 @@
 ; Our zero-page vars
-ox              = $80 ; fixed8.24: center point x
+sx    = $80     ; i16: screen pixel x
-oy              = $84 ; fixed8.24: center point y
+sy    = $82     ; i16: screen pixel y
-cx              = $88 ; fixed8.24: c_x
+ox    = $84     ; fixed4.12: center point x
-cy              = $8c ; fixed8.24: c_y
+oy    = $86     ; fixed4.12: center point y
 cx    = $88     ; fixed4.12: c_x
 cy    = $8a     ; fixed4.12: c_y
 zx    = $8c     ; fixed4.12: z_x
 zy    = $8e     ; fixed4.12: z_y
-zx              = $90 ; fixed8.24: z_x
+zx_2  = $90     ; fixed4.12: z_x^2
-zy              = $94 ; fixed8.24: z_y
+zy_2  = $92     ; fixed4.12: z_y^2
-zx_2            = $98 ; fixed8.24: z_x^2
+zx_zy = $94     ; fixed4.12: z_x * z_y
-zy_2            = $9c ; fixed8.24: z_y^2
+dist  = $96     ; fixed4.12: z_x^2 + z_y^2
-zx_zy           = $a0 ; fixed8.24: z_x * z_y
+iter          = $a0 ; u8: iteration count
 dist            = $a4 ; fixed8.24: z_x^2 + z_y^2
 sx              = $a8 ; i16: screen pixel x
 sy              = $aa ; i16: screen pixel y
 z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $ad ; u8: index into z_buffer
 z_buffer_end    = $ae ; u8: index into z_buffer
 iter            = $af ; u8: iteration count
-ptr             = $b0 ; u16
+zoom          = $a1 ; u8: zoom shift level
-pixel_ptr       = $b2 ; u16
+count_frames  = $a2 ; u8
-zoom            = $b4 ; u8: zoom shift level
+count_pixels  = $a3 ; u8
-fill_level      = $b5 ; u8
+total_ms      = $a4 ; float48
-pixel_color     = $b6 ; u8
+total_pixels  = $aa ; float48
 pixel_mask      = $b7 ; u8
 pixel_shift     = $b8 ; u8
 pixel_offset    = $b9 ; u8
 palette_offset  = $ba ; u8
 chroma_offset   = $bb ; u8
 palette_ticks   = $bc ; u8
 chroma_ticks    = $bd ; u8
 count_frames    = $be ; u8
 count_pixels    = $bf ; u8
-total_pixels    = $c0 ; float48
+z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
-total_ms        = $c6 ; float48
+z_buffer_start  = $b1 ; u8: index into z_buffer
-temp            = $cc ; u16
+z_buffer_end    = $b2 ; u8: index into z_buffer
-temp2           = $ce ; u16
+temp            = $b4 ; u16
 temp2           = $b6 ; u16
 pixel_ptr       = $b8 ; u16
 pixel_color     = $ba ; u8
 pixel_mask      = $bb ; u8
 pixel_shift     = $bc ; u8
 pixel_offset    = $bd ; u8
 fill_level      = $be ; u8
 palette_offset  = $bf ; u8
 palette_ticks = $c0 ; u8
 chroma_ticks  = $c1 ; u8
 chroma_offset = $c2 ; u8
 ptr           = $c4 ; u16
 palette_delay = 23
 chroma_delay = 137
@ -292,16 +293,16 @@ viewport_zoom:
    .byte 6
 ; Center-point x for each of the four viewports; the loader reads it with
 ; `lda viewport_ox,x` and stores it into ox.
 ; Each '+' .word is the same number as the '-' .dword it replaces,
 ; re-expressed from fixed8.24 to fixed4.12.
 viewport_ox:
-    .dword $00000000
+    .word $0000     ; fixed4.12: 0.0
-    .dword $ff110000
+    .word $f110     ; fixed4.12: -0.93359375
-    .dword $ff110000
+    .word $f110     ; fixed4.12: -0.93359375
-    .dword $fe400000
+    .word $e400     ; fixed4.12: -1.75
 ; Center-point y for the same four viewports; the loader stores it into oy.
 viewport_oy:
-    .dword $00000000
+    .word $0000     ; fixed4.12: 0.0
-    .dword $ffb60000
+    .word $fb60     ; fixed4.12: -0.2890625
-    .dword $ffbe0000
+    .word $fbe0     ; fixed4.12: -0.2578125
-    .dword $00000000
+    .word $0000     ; fixed4.12: 0.0
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
@ -320,7 +321,7 @@ viewport_oy:
 ; 38 cycles (2 + 9 * 4 bytes, per the cost formula on the `add` macro)
 ; dest = arg1 + arg2, each operand 4 bytes wide.
 ; All three operands are forwarded unchanged so that callers whose dest is
 ; not also arg1 (e.g. `add32 dist, zx_2, zy_2`) get arg1 + arg2 rather than
 ; dest + arg2.
 .macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
 .endmacro
 ; 8 cycles
@ -425,25 +426,22 @@ viewport_oy:
    round16 arg ; 11-27 cycles
 .endmacro
-; input: arg1, arg2 as fixed4.12
+.macro imul16_round dest, arg1, arg2, shift
 ; output: dest as fixed8.24 (imul16, which copies all of FR2 with copy32)
 ; NOTE(review): the imul16_round ('+') lines store only the two bytes at
 ; FR2 + 2 via copy16, so that dest is 16 bits wide - presumably fixed4.12
 ; when shift=4; confirm and update this header.
 .macro imul16 dest, arg1, arg2
    ; Multiplies arg1 by arg2: loads them into FR0 / FR1, calls imul16_func,
    ; then copies the result out of FR2.
    copy16 FR0, arg1  ; 12 cyc - first factor
    copy16 FR1, arg2  ; 12 cyc - second factor
    jsr imul16_func   ; ? cyc - result is read back from FR2
-    copy32 dest, FR2  ; 24 cyc
+    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
    copy16 dest, FR2 + 2  ; 12 cyc - keeps only the two bytes at FR2 + 2
 .endmacro
-; input: arg as fixed4.12
+.macro sqr16_round dest, arg, shift
-; output: dest as fixed8.24
+    ;imul16_round dest, arg, arg, shift
 .macro sqr16 dest, arg
    ; Squares arg: loads it into FR0, calls sqr16_func, then copies the
    ; result out of FR2.
    ; NOTE(review): the sqr16_round ('+') lines store only the two bytes at
    ; FR2 + 2 via copy16 - presumably a fixed4.12 result when shift=4; confirm.
    copy16 FR0, arg   ; 12 cyc - the single operand; FR1 is not loaded here
    jsr sqr16_func      ; ? cyc - result is read back from FR2
-    copy32 dest, FR2  ; 24 cyc
+    shift_round_16 FR2, shift ; 103-119 cycles for shift=4
    copy16 dest, FR2 + 2  ; 12 cyc - keeps only the two bytes at FR2 + 2
 .endmacro
 ; input: arg as u8
 ; output: dest as u16
 ; clobbers a, x
 .macro sqr8 dest, arg
    ldx arg
@ -453,8 +451,6 @@ viewport_oy:
    sta dest + 1
 .endmacro
 ; input: arg as u8
 ; input/output: dest as u16
 ; clobbers a, x
 .macro sqr8_add16 dest, arg
    ldx arg
@ -875,8 +871,8 @@ next:
 .proc mandelbrot
    ; input:
-    ; cx: position scaled to 8.24 fixed point - -128..+127.9
+    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; cy: position scaled to 8.24
+    ; cy: position scaled to 4.12
    ;
    ; output:
    ; iter: iteration count at escape or 0
@ -888,41 +884,12 @@ next:
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
 ;    lda #00
 ;    ldx #(iter - zx + 1)
 ;initloop:
 ;    sta zx - 1,x
 ;    dex
 ;    bne initloop
 ;    sta z_buffer_start
 ;    sta z_buffer_end
    lda #00
-    sta zx
+    ldx #(iter - zx + 1)
-    sta zx + 1
+initloop:
-    sta zx + 2
+    sta zx - 1,x
-    sta zx + 3
+    dex
-    sta zy
+    bne initloop
    sta zy + 1
    sta zy + 2
    sta zy + 3
    sta zx_2
    sta zx_2 + 1
    sta zx_2 + 2
    sta zx_2 + 3
    sta zy_2
    sta zy_2 + 1
    sta zy_2 + 2
    sta zy_2 + 3
    sta zx_zy
    sta zx_zy + 1
    sta zx_zy + 2
    sta zx_zy + 3
    sta dist
    sta dist + 1
    sta dist + 2
    sta dist + 3
    sta iter
    sta z_buffer_start
    sta z_buffer_end
@ -934,8 +901,6 @@ loop:
 keep_going:
    .macro quick_exit arg, max
        ; arg: fixed8.24
        ; max: integer
        .local positive
        .local negative
        .local nope_out
@ -943,16 +908,16 @@ keep_going:
        .local all_done
        ; check sign bit
-        lda arg + 3
+        lda arg + 1
        bmi negative
    positive:
-        cmp #max
+        cmp #((max) << 4)
        bmi all_done ; 'less than'
        jmp exit_path
    negative:
-        cmp #(256 - max)
+        cmp #(256 - ((max) << 4))
        beq first_equal ; 'equal' on first byte
        bpl all_done    ; 'greater than'
@ -960,44 +925,34 @@ keep_going:
        jmp exit_path
    first_equal:
        ; following bytes all 0 shows it's really 'equal'
        lda arg + 2
        bne all_done
        lda arg + 1
        bne all_done
        lda arg
-        bne all_done
+        beq nope_out  ; 2nd byte 0 shows it's really 'equal'
        jmp exit_path
    all_done:
    .endmacro
-    ; 8.24: (-128 .. 127.9)
+    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2  - zy_2  + cx
-    sub32 zx, zx_2, zy_2
+    sub16 zx, zx_2, zy_2
-    add32 zx, zx, cx
+    add16 zx, zx, cx
    quick_exit zx, 2
    ; zy = zx_zy + zx_zy + cy
-    add32 zy, zx_zy, zx_zy
+    add16 zy, zx_zy, zx_zy
-    add32 zy, zy, cy
+    add16 zy, zy, cy
    quick_exit zy, 2
    ; convert 8.24 -> 4.12: (-8 .. +7.9)
    shift_round_16 zx, 4
    shift_round_16 zy, 4
    ; zx_2 = zx * zx
-    sqr16 zx_2, zx + 2
+    sqr16_round zx_2, zx, 4
    ; zy_2 = zy * zy
-    sqr16 zy_2, zy + 2
+    sqr16_round zy_2, zy, 4
    ; zx_zy = zx * zy
-    imul16 zx_zy, zx + 2, zy + 2
+    imul16_round zx_zy, zx, zy, 4
    ; dist = zx_2 + zy_2
-    add32 dist, zx_2, zy_2
+    add16 dist, zx_2, zy_2
    quick_exit dist, 4
    ; if may be in the lake, look for looping output with a small buffer
@ -1034,10 +989,10 @@ z_buffer_loop:
    ; Compare the previously stored z values
    ldy #0
-    z_compare zx + 2
+    z_compare zx
-    z_compare zx + 3
+    z_compare zx + 1
-    z_compare zy + 2
+    z_compare zy
-    z_compare zy + 3
+    z_compare zy + 1
    cpy #4
    bne z_no_matches
@ -1052,10 +1007,10 @@ z_no_matches:
 z_nothing_to_read:
    ; Store and expand
-    z_store zx + 2
+    z_store zx
-    z_store zx + 3
+    z_store zx + 1
-    z_store zy + 2
+    z_store zy
-    z_store zy + 3
+    z_store zy + 1
    z_advance
    stx z_buffer_end
@ -1106,17 +1061,14 @@ cont:
 enough:
 .endmacro
-.macro zoom_factor dest, src, aspect
+.macro zoom_factor dest, src, zoom, aspect
    ; Copies src into dest, applies scale_zoom to dest, then multiplies dest
    ; by aspect in place.
    ; output: dest: fixed8.24 (via imul16 on the '-' line)
    ;   NOTE(review): the '+' line uses imul16_round with shift 4, which
    ;   writes dest with copy16 - presumably fixed4.12; confirm.
    ; input: src: fixed4.12
    ;   NOTE(review): the visible callers pass sx / sy, which are declared as
    ;   i16 screen pixels - confirm the intended input format.
    ; zoom ('+' signature only): not referenced by any line of this body.
    ; aspect: fixed4.12
    ; clobbers A, X, flags, etc
    copy16 dest, src
    scale_zoom dest
    ; presumably the aspect argument carries one of these per-axis factors:
    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
-    imul16 dest, dest, aspect
+    imul16_round dest, dest, aspect, 4
 .endmacro
 .proc pset
@ -1454,32 +1406,17 @@ zero_byte_loop:
    txa
    asl a
    asl a
    tax
    lda viewport_ox,x
    sta ox
    lda viewport_oy,x
    sta oy
    inx
    lda viewport_ox,x
    sta ox + 1
    lda viewport_oy,x
    sta oy + 1
    inx
    lda viewport_ox,x
    sta ox + 2
    lda viewport_oy,x
    sta oy + 2
    inx
    lda viewport_ox,x
    sta ox + 3
    lda viewport_oy,x
    sta oy + 3
    rts
 .endproc
@ -1601,10 +1538,10 @@ skipped_mask:
 not_skipped_mask:
    ; run the fractal!
-    zoom_factor cx, sx, aspect_x
+    zoom_factor cx, sx, zoom, aspect_x
-    add32 cx, cx, ox
+    add16 cx, cx, ox
-    zoom_factor cy, sy, aspect_y
+    zoom_factor cy, sy, zoom, aspect_y
-    add32 cy, cy, oy
+    add16 cy, cy, oy
    jsr mandelbrot
    jsr pset
--- a/readme.md
+++ b/readme.md
@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
 * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
-The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
+The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
 Iterations are capped at 255.
--- a/todo.md
+++ b/todo.md
@ -3,11 +3,13 @@ things to try:
 * skip add on the top-byte multiply in sqr8/mul8
  * should save a few cycles, suggestion by jamey
 * perform the zx = zx^2 - zy^2 + cx update in 32-bit space, before rounding
  * should improve precision on max zoom, might cost a few cycles
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 * try 3.13 fixed point instead of 4.12 for more precision
  * can we get away without the extra bit?
  * since the exit-compare space would be 6.26, I think so
 * y-axis mirror optimization