flesh out the mandelbrot iteration loop

some bits i missed increased total to: 1939 - 3007 cycles per iteration probably still buggy, will test later :D
2023-01-06 17:18:13 -08:00 · 2023-01-06 17:18:13 -08:00 · 32bd5a540c
commit 32bd5a540c
parent 3d94a9b5d4
1 changed files with 174 additions and 29 deletions
--- a/mandel.s
+++ b/mandel.s
@ -1,14 +1,99 @@
 ; Our zero-page vars
 sx    = $80     ; 8 bits: screen pixel x
 sy    = $81     ; 8 bits: screen pixel y
 cx    = $82     ; 16 bits fixed point
 cy    = $84     ; 16 bits fixed point
 zx    = $86     ; 16 bits fixed point
 zy    = $88     ; 16 bits fixed point
 zx_2  = $8a     ; 32 bits fixed point
 zy_2  = $8e     ; 32 bits fixed point
 zx_zy = $92     ; 32 bits fixed point
 dist  = $96     ; 32 bits fixed point  
 iter  = $9a     ; 8 bits iteration count
 temp  = $a0     ; debug temp area
 ; FP registers in zero page
 FR0 = $d4
 FRE = $da
 FR1 = $e0
 FR2 = $e6
 FRX = $ec
 .code
 .export start
 ; Multi-byte add: dest = arg1 + arg2 over `bytes` bytes.
 ; Bytes are processed from the lowest address upward so the carry
 ; ripples into each following byte.
 ; Cost: 2 + 9 * bytes cycles
 .macro add bytes, dest, arg1, arg2
    clc                      ; 2 cyc - no carry into the first byte
    .repeat bytes, offs      ; 9 cyc per byte
        lda arg1 + offs
        adc arg2 + offs
        sta dest + offs
    .endrepeat
 .endmacro
 ; 2-byte form of add: dest = arg1 + arg2 (expands to `add 2, ...`).
 .macro add16 dest, arg1, arg2
    add 2, dest, arg1, arg2
 .endmacro
 ; 4-byte form of add: dest = arg1 + arg2 across all 32 bits.
 ; (The previous expansion was a 2-byte `dest = arg2 + dest`, which
 ; ignored arg1 and never touched the upper two bytes.)
 ; 2 + 9 * 4 = 38 cyc
 .macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
 .endmacro
 ; Multi-byte subtract: dest = arg1 - arg2 over `bytes` bytes.
 ; Bytes are processed from the lowest address upward so the borrow
 ; ripples into each following byte.
 ; Cost: 2 + 9 * bytes cycles
 .macro sub bytes, dest, arg1, arg2
    sec                      ; 2 cyc - carry set means "no borrow" for sbc
    .repeat bytes, offs      ; 9 cyc per byte
        lda arg1 + offs
        sbc arg2 + offs
        sta dest + offs
    .endrepeat
 .endmacro
 ; 2-byte form of sub: dest = arg1 - arg2 (expands to `sub 2, ...`).
 .macro sub16 dest, arg1, arg2
    sub 2, dest, arg1, arg2
 .endmacro
 ; 4-byte form of sub: dest = arg1 - arg2 (expands to `sub 4, ...`).
 .macro sub32 dest, arg1, arg2
    sub 4, dest, arg1, arg2
 .endmacro
 ; Shift a multi-byte value (lowest address = low byte) left by one bit.
 ; asl pushes the low byte's top bit into carry; each rol then feeds that
 ; carry into the next higher byte (arg + 1 .. arg + bytes - 1).
 ; (Previously every rol targeted `arg` itself, so only the first byte
 ; was ever shifted.)
 .macro shl bytes, arg
    asl arg
    .repeat bytes-1, byte
        rol arg + 1 + byte
    .endrepeat
 .endmacro
 ; Expands to shl with a byte count of 2 (intended: one-bit left shift
 ; of a 16-bit value at arg).
 .macro shl16 arg
    shl 2, arg
 .endmacro
 ; Expands to shl with a byte count of 3 (intended: one-bit left shift
 ; of a 24-bit value at arg).
 .macro shl24 arg
    shl 3, arg
 .endmacro
 ; Expands to shl with a byte count of 4 (intended: one-bit left shift
 ; of a 32-bit value at arg).
 .macro shl32 arg
    shl 4, arg
 .endmacro
 ; Copy `bytes` bytes from arg to dest, one lda/sta pair per byte,
 ; lowest address first.
 ; Cost: 6 * bytes cycles
 .macro copy bytes, dest, arg
    .repeat bytes, offs      ; 6 cyc per byte
        lda arg + offs       ; 3 cyc
        sta dest + offs      ; 3 cyc
    .endrepeat
 .endmacro
 ; 2-byte form of copy: dest = arg (expands to `copy 2, ...`).
 .macro copy16 dest, arg
    copy 2, dest, arg
 .endmacro
 ; 4-byte form of copy: dest = arg (expands to `copy 4, ...`).
 .macro copy32 dest, arg
    copy 4, dest, arg
 .endmacro
 ; 2 + 8 * byte cycles
 .macro neg bytes, arg
    sec ; 2 cyc
@ -92,9 +177,17 @@ next:
 positive:
 .endmacro
 ; Call wrapper for imul16_func: loads the two 16-bit operands into
 ; FR0 / FR1, calls the routine, then copies the 32-bit result out of
 ; FR2 into dest. FR0, FR1 and FR2 are overwritten.
 ; Cost: 12 + 12 + (470-780) + 24 = 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
    copy 2, FR0, arg1        ; 12 cyc - first operand
    copy 2, FR1, arg2        ; 12 cyc - second operand
    jsr imul16_func          ; 470-780
    copy 4, dest, FR2        ; 24 cyc - fetch the 32-bit product
 .endmacro
 ; min 470 cycles
 ; max 780 cycles
-.proc imul16
+.proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
@ -128,7 +221,7 @@ positive_result:
    rts ; 6 cyc
 .endproc
-.macro round16_incdec arg
+.macro round16 arg
    ; Round top 16 bits of 32-bit fixed-point number in-place
    .local zero
    .local one
@ -178,61 +271,113 @@ next:
-.proc iter
+.proc mandelbrot
-    ; still working on the fixed-point
+    ; input:
-    ; should we just use 16-bit adds?
+    ; cx: position scaled to 4.12 fixed point - -8..+7.9
-    ; does that require extra rounding?
+    ; cy: position scaled to 4.12
-    ; is the integer precision right?
+    ;
    ; output:
    ; iter: iteration count at escape or 0
    ; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
+    ; Clear every byte from zx up to and including iter. X takes the
+    ; immediate span (not the contents of address iter - zx) and the
+    ; loop runs down through index 0 so the low byte of zx is cleared too.
    lda #00
    ldx #(iter - zx)
 initloop:
    sta zx,x
    dex
    bpl initloop
 loop:
-    ; 1652 - 2651 cyc
+    ; 1939 - 3007 cyc
-    ; iters++ = 2 cyc
+    ; iter++ & max-iters break = 7 cyc
    inc iter       ; 5 cyc
    bne keep_going ; 2 cyc
    rts            ; iter wrapped to 0: max iterations reached
 keep_going:
    ; 4.12: (-8 .. +7.9)
-    ; zx = zx_2  + zy_2  + cx   = 3 * 20 = 60 cyc
+    ; zx = zx_2  - zy_2  + cx   = 3 * 20 = 60 cyc
+    ; the rounded 4.12 products live in the top 16 bits (bytes 2-3)
    sub16 zx, zx_2 + 2, zy_2 + 2
    add16 zx, zx, cx
    ; zy = zx_zy + zx_zy + cy   = 3 * 20 = 60 cyc
    add16 zy, zx_zy + 2, zx_zy + 2
    add16 zy, zy, cy
    ; 8.24: (-128 .. +127.9)
-    ; zx_2 = zx * zx            = 470 - 780 cyc
+    ; zx_2 = zx * zx            = 518 - 828 cyc
-    ; zy_2 = zy * zy            = 470 - 780 cyc
+    imul16 zx_2, zx, zx
    ; zx_zy = zx * zy           = 470 - 780 cyc
    ; dist = zx_2 + zy_2        = 38 cyc
    ; if dist >= 4 break, else continue iterating = 7 cyc
-    ; shift and round zx_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
+    ; zy_2 = zy * zy            = 518 - 828 cyc
-    ; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
+    imul16 zy_2, zy, zy
-    ; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
+
    ; zx_zy = zx * zy           = 518 - 828 cyc
    imul16 zx_zy, zx, zy
    ; dist = zx_2 + zy_2        = 38 cyc
    ; full 4-byte add so dist + 3 really holds the 8.24 integer part
    add 4, dist, zx_2, zy_2
    ; if dist >= 4 break, else continue iterating = 7 cyc
    lda dist + 3  ; 3 cyc
    cmp #4        ; 2 cyc
    bcc still_in  ; 2 cyc - unsigned "less than": carry clear after cmp
    rts
 still_in:
    ; shift and round zx_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
    ; 8.24 << 4 leaves the 4.12 value in the top 16 bits; only bytes 1-3
    ; feed those bits, so the carry chain is spelled out over them.
    ; NOTE(review): assumes round16 only inspects byte 1 and leaves its
    ; result in bytes 2-3 - its body is not visible here, confirm.
    .repeat 4      ; 60 cyc
        asl zx_2 + 1 ; 5 cyc
        rol zx_2 + 2 ; 5 cyc
        rol zx_2 + 3 ; 5 cyc
    .endrepeat
    round16 zx_2   ; 5-28 cycles
    ; shift and round zy_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
    .repeat 4      ; 60 cyc
        asl zy_2 + 1 ; 5 cyc
        rol zy_2 + 2 ; 5 cyc
        rol zy_2 + 3 ; 5 cyc
    .endrepeat
    round16 zy_2   ; 5-28 cycles
    ; shift and round zx_zy to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
    .repeat 4       ; 60 cyc
        asl zx_zy + 1 ; 5 cyc
        rol zx_zy + 2 ; 5 cyc
        rol zx_zy + 3 ; 5 cyc
    .endrepeat
    round16 zx_zy   ; 5-28 cycles
    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
    jmp loop ; 3 cycles
 .endproc
 .proc start
 ; Debug harness: loads a fixed test point into cx/cy, runs mandelbrot,
 ; stores the resulting iteration count in temp, and repeats forever so
 ; the state can be inspected in a debugger.
 ; Multi-byte values are stored low byte first, matching the add/sub
 ; macros (they start their carry chain at the lowest address).
 looplong:
-    ; FR0 = 5
+    ; cx = -0.5  -> $f800 in 4.12 (low byte $00, high byte $f8)
-    ; FR1 = -3
+    lda #<$f800
-    lda #5
+    sta cx
    sta FR0
    lda #0
    sta FR0 + 1
    lda #$fd
    sta FR1
    lda #>$f800
-    sta FR1 + 1
+    sta cx + 1
-    jsr imul16
+    ; cy = 1  -> $1000 in 4.12 (low byte $00, high byte $10)
    lda #<$1000
    sta cy
    lda #>$1000
    sta cy + 1
    jsr mandelbrot
    ; NOTE(review): the FR0/FR1 stores above and the "-15 in FR2" remark
    ; look like leftovers from the earlier imul16 test - confirm and remove.
    ; should have 32-bit -15 in FR2
    ; save the completed iter count for debugging
    lda iter
    sta temp
 loop:
-    jmp loop
+    ; keep looping over so we can work in the debugger
    jmp looplong
 .endproc