diff --git a/mandel.s b/mandel.s
index 2b1e61d..31097bf 100644
--- a/mandel.s
+++ b/mandel.s
@@ -1,14 +1,99 @@
+; Our zero-page vars (multi-byte values are stored low byte first)
+sx    = $80 ; 8 bits: screen pixel x
+sy    = $81 ; 8 bits: screen pixel y
+cx    = $82 ; 16 bits fixed point (4.12)
+cy    = $84 ; 16 bits fixed point (4.12)
+zx    = $86 ; 16 bits fixed point (4.12); zx..iter is zeroed as one run by mandelbrot
+zy    = $88 ; 16 bits fixed point (4.12)
+zx_2  = $8a ; 32 bits fixed point
+zy_2  = $8e ; 32 bits fixed point
+zx_zy = $92 ; 32 bits fixed point
+dist  = $96 ; 32 bits fixed point
+iter  = $9a ; 8 bits iteration count
+
+temp = $a0 ; debug temp area
+
 ; FP registers in zero page
 FR0 = $d4
 FRE = $da
 FR1 = $e0
 FR2 = $e6
-FRX = $ec
 
 .code
 
 .export start
 
+; dest = arg1 + arg2 over `bytes` bytes, clobbers A: 2 + 9 * byte cycles
+.macro add bytes, dest, arg1, arg2
+    clc ; 2 cyc
+    .repeat bytes, byte ; 9 * byte cycles
+        lda arg1 + byte
+        adc arg2 + byte ; carry ripples into the next byte
+        sta dest + byte
+    .endrepeat
+.endmacro
+
+.macro add16 dest, arg1, arg2 ; 16-bit dest = arg1 + arg2
+    add 2, dest, arg1, arg2
+.endmacro
+
+.macro add32 dest, arg1, arg2 ; 32-bit dest = arg1 + arg2
+    add 4, dest, arg1, arg2 ; all four bytes, and both source operands
+.endmacro
+
+; dest = arg1 - arg2 over `bytes` bytes, clobbers A: 2 + 9 * byte cycles
+.macro sub bytes, dest, arg1, arg2
+    sec ; 2 cyc
+    .repeat bytes, byte ; 9 * byte cycles
+        lda arg1 + byte
+        sbc arg2 + byte ; borrow ripples into the next byte
+        sta dest + byte
+    .endrepeat
+.endmacro
+
+.macro sub16 dest, arg1, arg2 ; 16-bit dest = arg1 - arg2
+    sub 2, dest, arg1, arg2
+.endmacro
+
+.macro sub32 dest, arg1, arg2 ; 32-bit dest = arg1 - arg2
+    sub 4, dest, arg1, arg2
+.endmacro
+
+.macro shl bytes, arg ; shift a `bytes`-byte value left by one bit, in place
+    asl arg ; low byte; its bit 7 goes to carry
+    .repeat bytes-1, byte
+        rol arg + byte + 1 ; carry ripples up through each higher byte
+    .endrepeat
+.endmacro
+
+.macro shl16 arg
+    shl 2, arg
+.endmacro
+
+.macro shl24 arg
+    shl 3, arg
+.endmacro
+
+.macro shl32 arg
+    shl 4, arg
+.endmacro
+
+; dest = arg over `bytes` bytes, clobbers A: 6 * bytes cycles
+.macro copy bytes, dest, arg
+    .repeat bytes, byte ; 6 * bytes cycles
+        lda arg + byte ; 3 cyc
+        sta dest + byte ; 3 cyc
+    .endrepeat
+.endmacro
+
+.macro copy16 dest, arg
+    copy 2, dest, arg
+.endmacro
+
+.macro copy32 dest, arg
+    copy 4, dest, arg
+.endmacro
+
 ; 2 + 8 * byte cycles
 .macro neg bytes, arg
     sec ; 2 cyc
@@ -92,9 +177,17 @@ next:
 positive:
 .endmacro
 
+; 32-bit dest = 16-bit arg1 * 16-bit arg2, staged through FR0/FR1/FR2: 518 - 828 cyc
+.macro imul16 dest, arg1, arg2
+    copy16 FR0, arg1 ; 12 cyc
+    copy16 FR1, arg2 ; 12 cyc
+    jsr imul16_func ; 470-780
+    copy32 dest, FR2 ; 24 cyc
+.endmacro
+
 ; min 470 cycles
 ; max 780 cycles
-.proc imul16
+.proc imul16_func
     arg1 = FR0 ; 16-bit arg (clobbered)
     arg2 = FR1 ; 16-bit arg (clobbered)
     result = FR2 ; 32-bit result
@@ -128,7 +221,7 @@ positive_result:
     rts ; 6 cyc
 .endproc
 
-.macro round16_incdec arg
+.macro round16 arg
     ; Round top 16 bits of 32-bit fixed-point number in-place
     .local zero
     .local one
@@ -178,61 +271,113 @@ next:
 
 
 
-.proc iter
-    ; still working on the fixed-point
-    ; should we just use 16-bit adds?
-    ; does that require extra rounding?
-    ; is the integer precision right?
+.proc mandelbrot
+    ; input:
+    ;   cx: position scaled to 4.12 fixed point - -8..+7.9
+    ;   cy: position scaled to 4.12
+    ;   (clobbers A, X, FR0..FR2 and the zx..dist work area)
+    ; output:
+    ;   iter: iteration count at escape or 0
 
-    ; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)
     ; zx = 0
     ; zy = 0
     ; zx_2 = 0
     ; zy_2 = 0
     ; zx_zy = 0
+    ; dist = 0
+    ; iter = 0
+    lda #00
+    ldx #(iter - zx) ; immediate: offset of the last byte to clear
+initloop:
+    sta zx,x
+    dex
+    bpl initloop ; runs down through x = 0 so zx's low byte is cleared too
 
 loop:
-    ; 1652 - 2651 cyc
+    ; roughly 1950 - 3000 cyc per iteration
 
-    ; iters++ = 2 cyc
+    ; iter++ & max-iters break = 7 cyc
+    inc iter ; 5 cyc
+    bne keep_going ; 2 cyc
+    rts ; iter wrapped to 0: hit max iterations without escaping
+keep_going:
 
     ; 4.12: (-8 .. +7.9)
-    ; zx = zx_2 + zy_2 + cx = 3 * 20 = 60 cyc
+    ; zx = zx_2 - zy_2 + cx = 3 * 20 = 60 cyc (4.12 values are the top 16 bits)
+    sub16 zx, zx_2 + 2, zy_2 + 2
+    add16 zx, zx, cx
+
     ; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc
+    add16 zy, zx_zy + 2, zx_zy + 2
+    add16 zy, zy, cy
 
     ; 8.24: (-128 .. +127.9)
-    ; zx_2 = zx * zx = 470 - 780 cyc
-    ; zy_2 = zy * zy = 470 - 780 cyc
-    ; zx_zy = zx * zy = 470 - 780 cyc
-    ; dist = zx_2 + zy_2 = 38 cyc
-    ; if dist >= 4 break, else continue iterating = 7 cyc
+    ; zx_2 = zx * zx = 518 - 828 cyc
+    imul16 zx_2, zx, zx
 
-    ; shift and round zx_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
-    ; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
-    ; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
+    ; zy_2 = zy * zy = 518 - 828 cyc
+    imul16 zy_2, zy, zy
+
+    ; zx_zy = zx * zy = 518 - 828 cyc
+    imul16 zx_zy, zx, zy
+
+    ; dist = zx_2 + zy_2 = 38 cyc
+    add32 dist, zx_2, zy_2
+
+    ; if dist >= 4 break, else continue iterating = 7 cyc
+    lda dist + 3 ; 3 cyc - top byte is the integer part of the 8.24 value
+    cmp #4 ; 2 cyc
+    bcc still_in ; 2 cyc - unsigned compare: taken only when integer part < 4
+    rts
+still_in:
+
+    ; shift and round zx_2 to 4.12 in its top 16 bits = (80 + 5) - (80 + 28) = 85 - 108 cyc
+    .repeat 4 ; 80 cyc
+        shl32 zx_2 ; 20 cyc
+    .endrepeat
+    round16 zx_2 ; 5-28 cycles
+
+    ; shift and round zy_2 to 4.12 in its top 16 bits = (80 + 5) - (80 + 28) = 85 - 108 cyc
+    .repeat 4 ; 80 cyc
+        shl32 zy_2 ; 20 cyc
+    .endrepeat
+    round16 zy_2 ; 5-28 cycles
+
+    ; shift and round zx_zy to 4.12 in its top 16 bits = (80 + 5) - (80 + 28) = 85 - 108 cyc
+    .repeat 4 ; 80 cyc
+        shl32 zx_zy ; 20 cyc
+    .endrepeat
+    round16 zx_zy ; 5-28 cycles
 
     ; if may be in the lake, look for looping output with a small buffer
     ; as an optimization vs running to max iters
+    jmp loop ; 3 cycles
 
 .endproc
 
 .proc start
 
 looplong:
-    ; FR0 = 5
-    ; FR1 = -3
-    lda #5
-    sta FR0
-    lda #0
-    sta FR0 + 1
-    lda #$fd
-    sta FR1
-    lda #$ff
-    sta FR1 + 1
+    ; cx = -0.5 = $f800 in 4.12, stored low byte first
+    lda #$00
+    sta cx
+    lda #$f8
+    sta cx + 1
 
-    jsr imul16
+    ; cy = 1 = $1000 in 4.12, stored low byte first
+    lda #$00
+    sta cy
+    lda #$10
+    sta cy + 1
+
+    jsr mandelbrot
     ; should have 32-bit -15 in FR2
 
+    ; save the completed iter count for debugging
+    lda iter
+    sta temp
+
 loop:
-    jmp loop
+    ; keep looping over so we can work in the debugger
+    jmp looplong
 .endproc