flesh out the mandelbrot iteration loop

Some bits I missed increased the total to:
1939 - 3007 cycles per iteration

probably still buggy, will test later :D
This commit is contained in:
Brooke Vibber 2023-01-06 17:18:13 -08:00
parent 3d94a9b5d4
commit 32bd5a540c

207
mandel.s
View file

@ -1,14 +1,99 @@
; Our zero-page vars
; Multi-byte values are stored low byte first: the add/sub macros start at
; offset 0 and propagate the carry upward.
; zx through iter occupy consecutive addresses ($86..$9a) with no gaps.
sx = $80 ; 8 bits: screen pixel x
sy = $81 ; 8 bits: screen pixel y
cx = $82 ; 16 bits fixed point ($82..$83)
cy = $84 ; 16 bits fixed point ($84..$85)
zx = $86 ; 16 bits fixed point ($86..$87)
zy = $88 ; 16 bits fixed point ($88..$89)
zx_2 = $8a ; 32 bits fixed point ($8a..$8d)
zy_2 = $8e ; 32 bits fixed point ($8e..$91)
zx_zy = $92 ; 32 bits fixed point ($92..$95)
dist = $96 ; 32 bits fixed point ($96..$99)
iter = $9a ; 8 bits iteration count
temp = $a0 ; debug temp area
; FP registers in zero page
; FR0, FR1 and FR2 are used as the argument and result slots of the
; 16-bit multiply (see the imul16 macro).
FR0 = $d4
FRE = $da
FR1 = $e0
FR2 = $e6
FRX = $ec

.code

.export start
; add: multi-byte addition, dest = arg1 + arg2
;   bytes      - operand width in bytes
;   dest       - address of the result (may be the same as arg1 or arg2)
;   arg1, arg2 - addresses of the operands, low byte first
; Clobbers A and the flags; carry out of the top byte is left in C.
; Cost with zero-page operands: 2 + 9 * bytes cycles.
.macro add bytes, dest, arg1, arg2
    clc                         ; 2 cyc: no carry into the lowest byte
    .repeat bytes, offset       ; 9 cyc per byte
        lda arg1 + offset
        adc arg2 + offset
        sta dest + offset
    .endrepeat
.endmacro
; add16: 16-bit addition, dest = arg1 + arg2 (2-byte form of add).
.macro add16 dest, arg1, arg2
    add 2, dest, arg1, arg2
.endmacro
; add32: 32-bit addition, dest = arg1 + arg2 (4-byte form of add).
; Previously this expanded to a 2-byte add of arg2 and dest, which ignored
; arg1 and left the upper two bytes of dest untouched.
.macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
.endmacro
; sub: multi-byte subtraction, dest = arg1 - arg2
;   bytes      - operand width in bytes
;   dest       - address of the result (may be the same as arg1 or arg2)
;   arg1, arg2 - addresses of minuend and subtrahend, low byte first
; Clobbers A and the flags; C is clear afterwards if a borrow occurred.
; Cost with zero-page operands: 2 + 9 * bytes cycles.
.macro sub bytes, dest, arg1, arg2
    sec                         ; 2 cyc: no borrow into the lowest byte
    .repeat bytes, offset       ; 9 cyc per byte
        lda arg1 + offset
        sbc arg2 + offset
        sta dest + offset
    .endrepeat
.endmacro
; sub16: 16-bit subtraction, dest = arg1 - arg2 (2-byte form of sub).
.macro sub16 dest, arg1, arg2
    sub 2, dest, arg1, arg2
.endmacro

; sub32: 32-bit subtraction, dest = arg1 - arg2 (4-byte form of sub).
.macro sub32 dest, arg1, arg2
    sub 4, dest, arg1, arg2
.endmacro
; shl: shift a multi-byte value left by one bit, in place
;   bytes - operand width in bytes
;   arg   - address of the value, low byte first
; The low byte is shifted with asl, then each higher byte is rotated with
; rol so the bit shifted out of one byte becomes bit 0 of the next.
; Previously every rol targeted arg itself, so only the low byte changed.
; Clobbers the flags; the bit shifted out of the top byte is left in C.
; Cost with a zero-page operand: 5 * bytes cycles.
.macro shl bytes, arg
    asl arg                     ; 5 cyc: byte 0, bit 7 -> C
    .repeat bytes-1, i          ; 5 cyc per remaining byte
        rol arg + 1 + i         ; C -> bit 0, bit 7 -> C
    .endrepeat
.endmacro
; shl16: shift the 2-byte value at arg left by one bit.
.macro shl16 arg
    shl 2, arg
.endmacro

; shl24: shift the 3-byte value at arg left by one bit.
.macro shl24 arg
    shl 3, arg
.endmacro

; shl32: shift the 4-byte value at arg left by one bit.
.macro shl32 arg
    shl 4, arg
.endmacro
; copy: copy a multi-byte value, dest = arg
;   bytes - number of bytes to copy
;   dest  - destination address
;   arg   - source address
; Bytes are copied from offset 0 upward through A, so A and the N/Z flags
; are clobbered.
; Cost with zero-page operands: 6 * bytes cycles.
.macro copy bytes, dest, arg
    .repeat bytes, offset       ; 6 cyc per byte
        lda arg + offset        ; 3 cyc
        sta dest + offset       ; 3 cyc
    .endrepeat
.endmacro
; copy16: copy the 2-byte value at arg to dest.
.macro copy16 dest, arg
    copy 2, dest, arg
.endmacro

; copy32: copy the 4-byte value at arg to dest.
.macro copy32 dest, arg
    copy 4, dest, arg
.endmacro
; 2 + 8 * byte cycles ; 2 + 8 * byte cycles
.macro neg bytes, arg .macro neg bytes, arg
sec ; 2 cyc sec ; 2 cyc
@ -92,9 +177,17 @@ next:
positive: positive:
.endmacro .endmacro
; imul16: 16-bit x 16-bit multiply with a 32-bit result, dest = arg1 * arg2
;   dest       - address that receives the 4-byte product
;   arg1, arg2 - addresses of the 2-byte operands (may be the same address)
; The operands are staged in FR0 and FR1, imul16_func is called, and the
; product is copied out of FR2. FR0, FR1 and FR2 are overwritten, as is A.
; Total cost: 518 - 828 cycles.
.macro imul16 dest, arg1, arg2
    copy16 FR0, arg1            ; 12 cyc
    copy16 FR1, arg2            ; 12 cyc
    jsr imul16_func             ; 470 - 780 cyc
    copy32 dest, FR2            ; 24 cyc
.endmacro
; min 470 cycles ; min 470 cycles
; max 780 cycles ; max 780 cycles
.proc imul16 .proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered) arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result result = FR2 ; 32-bit result
@ -128,7 +221,7 @@ positive_result:
rts ; 6 cyc rts ; 6 cyc
.endproc .endproc
.macro round16_incdec arg .macro round16 arg
; Round top 16 bits of 32-bit fixed-point number in-place ; Round top 16 bits of 32-bit fixed-point number in-place
.local zero .local zero
.local one .local one
@ -178,61 +271,113 @@ next:
.proc mandelbrot
    ; Iterate z = z^2 + c for a single point until it escapes or the
    ; iteration counter wraps.
    ;
    ; input:
    ;   cx: real part of c, 16-bit 4.12 fixed point (-8 .. +7.9)
    ;   cy: imaginary part of c, 16-bit 4.12 fixed point
    ;
    ; output:
    ;   iter: iteration count at escape, or 0 if the 8-bit counter wrapped
    ;         around without the point escaping
    ;
    ; clobbers: A, X, flags, zx, zy, zx_2, zy_2, zx_zy, dist, and whatever
    ;           imul16 and round16 use (FR0, FR1, FR2 for imul16)

    ; Clear all working state in one pass:
    ;   zx = zy = zx_2 = zy_2 = zx_zy = dist = iter = 0
    ; This relies on those variables being contiguous in zero page, with
    ; zx at the lowest address and iter at the highest.
    ; X counts down from (iter - zx) to 0 inclusive; bpl exits once X wraps
    ; to $ff, so the byte at zx + 0 is cleared too.
    lda #0
    ldx #(iter - zx)
initloop:
    sta zx,x
    dex
    bpl initloop

loop:
    ; Each pass is dominated by the three imul16 calls (518 - 828 cyc each).

    ; iter++ & max-iters break = 7 cyc
    inc iter                    ; 5 cyc
    bne keep_going              ; 2 cyc
    rts                         ; counter wrapped to 0: give up
keep_going:

    ; 4.12: (-8 .. +7.9)
    ; After the shift-and-round step at the bottom of the loop, the 4.12
    ; value of each product lives in its top 16 bits (bytes 2 and 3), so
    ; the 16-bit operands below are taken from offset + 2.

    ; zx = zx_2 - zy_2 + cx = 2 * 20 = 40 cyc
    sub16 zx, zx_2 + 2, zy_2 + 2
    add16 zx, zx, cx

    ; zy = zx_zy + zx_zy + cy = 2 * 20 = 40 cyc
    add16 zy, zx_zy + 2, zx_zy + 2
    add16 zy, zy, cy

    ; 8.24: (-128 .. +127.9)
    ; zx_2 = zx * zx = 518 - 828 cyc
    imul16 zx_2, zx, zx

    ; zy_2 = zy * zy = 518 - 828 cyc
    imul16 zy_2, zy, zy

    ; zx_zy = zx * zy = 518 - 828 cyc
    imul16 zx_zy, zx, zy

    ; dist = zx_2 + zy_2 = 38 cyc
    add32 dist, zx_2, zy_2

    ; if dist >= 4 break, else continue iterating = 7 cyc
    ; The top byte of an 8.24 value is its integer part. dist is a sum of
    ; squares, so compare it unsigned: cmp leaves C clear only when A < 4.
    lda dist + 3                ; 3 cyc
    cmp #4                      ; 2 cyc
    bcc still_in                ; 2 cyc
    rts                         ; escaped: iter holds the count
still_in:

    ; Convert each 8.24 product to 4.12 in its top 16 bits: shift the whole
    ; 32-bit value left by 4 bits, then round the top half in place.

    ; shift and round zx_2 to 4.12 = (80 + 5) - (80 + 28) = 85 - 108 cyc
    .repeat 4                   ; 80 cyc
        shl32 zx_2              ; 20 cyc
    .endrepeat
    round16 zx_2                ; 5 - 28 cyc

    ; shift and round zy_2 to 4.12 = 85 - 108 cyc
    .repeat 4                   ; 80 cyc
        shl32 zy_2              ; 20 cyc
    .endrepeat
    round16 zy_2                ; 5 - 28 cyc

    ; shift and round zx_zy to 4.12 = 85 - 108 cyc
    .repeat 4                   ; 80 cyc
        shl32 zx_zy             ; 20 cyc
    .endrepeat
    round16 zx_zy               ; 5 - 28 cyc

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters

    jmp loop                    ; 3 cyc
.endproc
.proc start
    ; Debug entry point: run mandelbrot on one fixed test point, store the
    ; resulting iteration count in temp, and repeat forever so the state
    ; can be inspected in a debugger.
looplong:
    ; cx = -0.5 in 4.12 fixed point = -2048 = $f800, low byte first
    lda #<$f800
    sta cx
    lda #>$f800
    sta cx + 1

    ; cy = 1 in 4.12 fixed point = 4096 = $1000, low byte first
    lda #<$1000
    sta cy
    lda #>$1000
    sta cy + 1

    jsr mandelbrot

    ; save the completed iter count for debugging
    lda iter
    sta temp

loop:
    ; keep looping over so we can work in the debugger
    jmp looplong
.endproc