flesh out the mandelbrot iteration loop
Some bits I missed increased the total to 1939 - 3007 cycles per iteration. Probably still buggy; will test later :D
This commit is contained in:
parent
3d94a9b5d4
commit
32bd5a540c
1 changed files with 176 additions and 31 deletions
207
mandel.s
207
mandel.s
|
@ -1,14 +1,99 @@
|
|||
; Our zero-page vars
|
||||
sx = $80 ; 8 bits: screen pixel x
|
||||
sy = $81 ; 8 bits: screen pixel y
|
||||
cx = $82 ; 16 bits fixed point
|
||||
cy = $84 ; 16 bits fixed point
|
||||
zx = $86 ; 16 bits fixed point
|
||||
zy = $88 ; 16 bits fixed point
|
||||
zx_2 = $8a ; 32 bits fixed point
|
||||
zy_2 = $8e ; 32 bits fixed point
|
||||
zx_zy = $92 ; 32 bits fixed point
|
||||
dist = $96 ; 32 bits fixed point
|
||||
iter = $9a ; 8 bits iteration count
|
||||
|
||||
temp = $a0 ; debug temp area
|
||||
|
||||
; FP registers in zero page
|
||||
FR0 = $d4
|
||||
FRE = $da
|
||||
FR1 = $e0
|
||||
FR2 = $e6
|
||||
FRX = $ec
|
||||
|
||||
.code
|
||||
|
||||
.export start
|
||||
|
||||
; 2 + 9 * byte cycles
|
||||
; add bytes, dest, arg1, arg2
;   Multi-byte addition: dest = arg1 + arg2, processed one byte at a time
;   starting at offset 0, with the carry chained through ADC.
;   bytes - number of bytes to add
;   dest  - address of the result
;   arg1  - address of the first operand
;   arg2  - address of the second operand
;   Clobbers A and the processor flags.
.macro add bytes, dest, arg1, arg2
        clc                     ; 2 cyc - no carry into the first byte
        .repeat bytes, i        ; 9 * byte cycles
        lda arg1 + i
        adc arg2 + i
        sta dest + i
        .endrepeat
.endmacro
|
||||
|
||||
; add16 dest, arg1, arg2
;   Two-byte convenience wrapper: expands to `add 2, dest, arg1, arg2`.
.macro add16 dest, arg1, arg2
|
||||
add 2, dest, arg1, arg2
|
||||
.endmacro
|
||||
|
||||
; add32 dest, arg1, arg2
;   Four-byte addition: dest = arg1 + arg2.
;   Expands to the generic `add` macro with a width of 4 bytes, in the same
;   operand order as add16 (the previous expansion used a width of 2 and
;   ignored arg1).
;   Clobbers A and the processor flags.
.macro add32 dest, arg1, arg2
        add 4, dest, arg1, arg2
.endmacro
|
||||
|
||||
; 2 + 9 * byte cycles
|
||||
; sub bytes, dest, arg1, arg2
;   Multi-byte subtraction: dest = arg1 - arg2, processed one byte at a time
;   starting at offset 0, with the borrow chained through SBC.
;   bytes - number of bytes to subtract
;   dest  - address of the result
;   arg1  - address of the minuend
;   arg2  - address of the subtrahend
;   Clobbers A and the processor flags.
.macro sub bytes, dest, arg1, arg2
        sec                     ; 2 cyc - no borrow into the first byte
        .repeat bytes, i        ; 9 * byte cycles
        lda arg1 + i
        sbc arg2 + i
        sta dest + i
        .endrepeat
.endmacro
|
||||
|
||||
; sub16 dest, arg1, arg2
;   Two-byte convenience wrapper: expands to `sub 2, dest, arg1, arg2`.
.macro sub16 dest, arg1, arg2
|
||||
sub 2, dest, arg1, arg2
|
||||
.endmacro
|
||||
|
||||
; sub32 dest, arg1, arg2
;   Four-byte convenience wrapper: expands to `sub 4, dest, arg1, arg2`.
.macro sub32 dest, arg1, arg2
|
||||
sub 4, dest, arg1, arg2
|
||||
.endmacro
|
||||
|
||||
; shl bytes, arg
;   Shift the multi-byte value at `arg` left by one bit.
;   Byte 0 is shifted with ASL; each following byte is rotated with ROL so
;   the bit shifted out of the byte below is carried into it. (The previous
;   expansion rotated byte 0 repeatedly and never touched the higher bytes.)
;   bytes - width of the value in bytes
;   arg   - address of the value, modified in place
;   Clobbers the processor flags.
.macro shl bytes, arg
        asl arg                 ; bit 7 of byte 0 -> carry
        .repeat bytes - 1, i
        rol arg + 1 + i         ; carry in from the byte below, bit 7 out
        .endrepeat
.endmacro
|
||||
|
||||
; shl16 arg
;   Two-byte convenience wrapper: expands to `shl 2, arg`.
.macro shl16 arg
|
||||
shl 2, arg
|
||||
.endmacro
|
||||
|
||||
; shl24 arg
;   Three-byte convenience wrapper: expands to `shl 3, arg`.
.macro shl24 arg
|
||||
shl 3, arg
|
||||
.endmacro
|
||||
|
||||
; shl32 arg
;   Four-byte convenience wrapper: expands to `shl 4, arg`.
.macro shl32 arg
|
||||
shl 4, arg
|
||||
.endmacro
|
||||
|
||||
; 6 * bytes cycles
|
||||
; copy bytes, dest, arg
;   Copy `bytes` bytes from `arg` to `dest`, one byte at a time starting at
;   offset 0, going through the accumulator.
;   bytes - number of bytes to copy
;   dest  - destination address
;   arg   - source address
;   Clobbers A and the N/Z flags.
.macro copy bytes, dest, arg
        .repeat bytes, i        ; 6 * bytes cycles
        lda arg + i             ; 3 cyc
        sta dest + i            ; 3 cyc
        .endrepeat
.endmacro
|
||||
|
||||
; copy16 dest, arg
;   Two-byte convenience wrapper: expands to `copy 2, dest, arg`.
.macro copy16 dest, arg
|
||||
copy 2, dest, arg
|
||||
.endmacro
|
||||
|
||||
; copy32 dest, arg
;   Four-byte convenience wrapper: expands to `copy 4, dest, arg`.
.macro copy32 dest, arg
|
||||
copy 4, dest, arg
|
||||
.endmacro
|
||||
|
||||
; 2 + 8 * byte cycles
|
||||
.macro neg bytes, arg
|
||||
sec ; 2 cyc
|
||||
|
@ -92,9 +177,17 @@ next:
|
|||
positive:
|
||||
.endmacro
|
||||
|
||||
; 518 - 828 cyc
|
||||
; imul16 dest, arg1, arg2
;   Multiply two 16-bit values via the imul16_func subroutine and store the
;   32-bit product at `dest`.
;   The operands are staged in FR0 and FR1 and the product is read back
;   from FR2, so all three zero-page FP registers are overwritten.
;   dest - address of the 32-bit result
;   arg1 - address of the first 16-bit operand
;   arg2 - address of the second 16-bit operand
;   NOTE(review): presumably a signed multiply - confirm against
;   imul16_func.
.macro imul16 dest, arg1, arg2
        copy 2, FR0, arg1       ; 12 cyc - stage first operand
        copy 2, FR1, arg2       ; 12 cyc - stage second operand
        jsr imul16_func         ; 470-780
        copy 4, dest, FR2       ; 24 cyc - fetch the 32-bit product
.endmacro
|
||||
|
||||
; min 470 cycles
|
||||
; max 780 cycles
|
||||
.proc imul16
|
||||
.proc imul16_func
|
||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
||||
result = FR2 ; 32-bit result
|
||||
|
@ -128,7 +221,7 @@ positive_result:
|
|||
rts ; 6 cyc
|
||||
.endproc
|
||||
|
||||
.macro round16_incdec arg
|
||||
.macro round16 arg
|
||||
; Round top 16 bits of 32-bit fixed-point number in-place
|
||||
.local zero
|
||||
.local one
|
||||
|
@ -178,61 +271,113 @@ next:
|
|||
|
||||
|
||||
|
||||
.proc iter
|
||||
; still working on the fixed-point
|
||||
; should we just use 16-bit adds?
|
||||
; does that require extra rounding?
|
||||
; is the integer precision right?
|
||||
.proc mandelbrot
        ; Iterate z = z^2 + c for one point until it escapes or the 8-bit
        ; iteration counter wraps.
        ;
        ; input:
        ;   cx: position scaled to 4.12 fixed point - -8..+7.9
        ;   cy: position scaled to 4.12
        ;
        ; output:
        ;   iter: iteration count at escape or 0
        ;
        ; Clobbers A, X, flags, the working variables zx..dist, and the
        ; FR0/FR1/FR2 registers used by the imul16 macro.
        ;
        ; Storage convention: zx_2, zy_2 and zx_zy are 32-bit slots. Right
        ; after a multiply they hold the raw 8.24 product; after the
        ; shift-and-round step at the bottom of the loop their TOP 16 bits
        ; (offset + 2) hold the 4.12 value used by the next iteration.
        ;
        ; NOTE(review): the per-iteration cycle total previously quoted here
        ; (1939 - 3007) predates the 32-bit shifts below and needs recounting.

        ; zx = zy = zx_2 = zy_2 = zx_zy = dist = iter = 0
        ; The variables are contiguous in zero page from zx through iter,
        ; so clear offsets (iter - zx) down to 0 inclusive.
        lda #0
        ldx #(iter - zx)        ; immediate byte count, not a memory load
initloop:
        sta zx,x
        dex
        bpl initloop            ; bpl so that offset 0 is cleared as well

loop:
        ; iter++ & max-iters break = 7 cyc
        inc iter                ; 5 cyc
        bne keep_going          ; 2 cyc
        rts                     ; counter wrapped to 0: report "did not escape"
keep_going:

        ; 4.12: (-8 .. +7.9)
        ; zx = zx_2 - zy_2 + cx
        ; The 4.12 squares live in the top 16 bits of the 32-bit slots.
        sub16 zx, zx_2 + 2, zy_2 + 2
        add16 zx, zx, cx

        ; zy = zx_zy + zx_zy + cy
        add16 zy, zx_zy + 2, zx_zy + 2
        add16 zy, zy, cy

        ; 8.24: (-128 .. +127.9)
        ; zx_2 = zx * zx = 518 - 828 cyc
        imul16 zx_2, zx, zx

        ; zy_2 = zy * zy = 518 - 828 cyc
        imul16 zy_2, zy, zy

        ; zx_zy = zx * zy = 518 - 828 cyc
        imul16 zx_zy, zx, zy

        ; dist = zx_2 + zy_2 = 38 cyc
        ; Full 32-bit add on the raw 8.24 products, written with the generic
        ; `add` macro so the width is explicit.
        add 4, dist, zx_2, zy_2

        ; if dist >= 4 break, else continue iterating = 7 cyc
        ; dist is 8.24, so its top byte is the integer part. It is a sum of
        ; squares (non-negative), hence the unsigned comparison.
        lda dist + 3            ; 3 cyc
        cmp #4                  ; 2 cyc
        bcc still_in            ; 2 cyc - integer part < 4
        rts
still_in:

        ; shift and round zx_2 to 4.12
        ; Shifting the 8.24 product left by 4 bits across all four bytes
        ; leaves the 4.12 value in the top 16 bits.
        .repeat 4               ; 4 * 20 cyc
        asl zx_2
        rol zx_2 + 1
        rol zx_2 + 2
        rol zx_2 + 3
        .endrepeat
        round16 zx_2            ; 5-28 cycles

        ; shift and round zy_2 to 4.12
        .repeat 4               ; 4 * 20 cyc
        asl zy_2
        rol zy_2 + 1
        rol zy_2 + 2
        rol zy_2 + 3
        .endrepeat
        round16 zy_2            ; 5-28 cycles

        ; shift and round zx_zy to 4.12
        .repeat 4               ; 4 * 20 cyc
        asl zx_zy
        rol zx_zy + 1
        rol zx_zy + 2
        rol zx_zy + 3
        .endrepeat
        round16 zx_zy           ; 5-28 cycles

        ; if may be in the lake, look for looping output with a small buffer
        ; as an optimization vs running to max iters
        jmp loop                ; 3 cycles

.endproc
|
||||
|
||||
.proc start
        ; Debug entry point: run the Mandelbrot iteration for one hard-coded
        ; point and stash the resulting iteration count in `temp` so it can
        ; be inspected in a debugger.
        ;
        ; Constants are 4.12 fixed point (value * 4096), stored low byte
        ; first to match the multi-byte add/sub macros.

looplong:
        ; cx = -0.5  ->  -0.5 * 4096 = -2048 = $f800
        lda #$00
        sta cx
        lda #$f8
        sta cx + 1

        ; cy = 1  ->  1 * 4096 = $1000
        lda #$00
        sta cy
        lda #$10
        sta cy + 1

        jsr mandelbrot

        ; save the completed iter count for debugging
        lda iter
        sta temp

loop:
        jmp loop
        ; keep looping over so we can work in the debugger
        ; NOTE(review): unreachable while the `jmp loop` above spins forever;
        ; confirm which of the two loops is intended.
        jmp looplong
.endproc
|
||||
|
|
Loading…
Reference in a new issue