forked from brooke/mandel-6502
flesh out the mandelbrot iteration loop
Some bits I missed increased the total to 1939 - 3007 cycles per iteration. Probably still buggy; will test later :D
This commit is contained in:
1 changed files with 176 additions and 31 deletions
@ -1,14 +1,99 @@
; Our zero-page vars
sx = $80 ; 8 bits: screen pixel x
sy = $81 ; 8 bits: screen pixel y
cx = $82 ; 16 bits fixed point
cy = $84 ; 16 bits fixed point
zx = $86 ; 16 bits fixed point
zy = $88 ; 16 bits fixed point
zx_2 = $8a ; 32 bits fixed point
zy_2 = $8e ; 32 bits fixed point
zx_zy = $92 ; 32 bits fixed point
dist = $96 ; 32 bits fixed point
iter = $9a ; 8 bits iteration count
temp = $a0 ; debug temp area
; FP registers in zero page
; FP registers in zero page
FR0 = $d4
FR0 = $d4
FRE = $da
FRE = $da
FR1 = $e0
FR1 = $e0
FR2 = $e6
FR2 = $e6
FRX = $ec
.export start
.export start
; Multi-byte add: dest = arg1 + arg2 over `bytes` bytes, lowest address first.
; Carry is cleared once and then chained through every byte.
; Cost: 2 + 9 * bytes cycles
.macro add bytes, dest, arg1, arg2
    clc                         ; no carry-in for the first byte (2 cyc)
    .repeat bytes, offset       ; unrolled, 9 cyc per byte
        lda arg1 + offset
        adc arg2 + offset
        sta dest + offset
; 16-bit (2-byte) add: dest = arg1 + arg2, expanded through the generic add macro.
.macro add16 dest, arg1, arg2
add 2, dest, arg1, arg2
; 32-bit (4-byte) add: dest = arg1 + arg2, expanded through the generic add macro.
; Width and operand order mirror sub32 (4 bytes, dest/arg1/arg2); the previous
; expansion only added two bytes of arg2 into dest and never read arg1.
.macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
; Multi-byte subtract: dest = arg1 - arg2 over `bytes` bytes, lowest address first.
; Carry is set once (no borrow-in) and the borrow chains through every byte.
; Cost: 2 + 9 * bytes cycles
.macro sub bytes, dest, arg1, arg2
    sec                         ; no borrow-in for the first byte (2 cyc)
    .repeat bytes, offset       ; unrolled, 9 cyc per byte
        lda arg1 + offset
        sbc arg2 + offset
        sta dest + offset
; 16-bit (2-byte) subtract: dest = arg1 - arg2, expanded through the generic sub macro.
.macro sub16 dest, arg1, arg2
sub 2, dest, arg1, arg2
; 32-bit (4-byte) subtract: dest = arg1 - arg2, expanded through the generic sub macro.
.macro sub32 dest, arg1, arg2
sub 4, dest, arg1, arg2
; Multi-byte shift left by one bit, in place, over `bytes` bytes starting at arg
; (lowest address = least significant byte).
; The first byte is shifted with asl; each following byte is rotated with rol so
; the bit shifted out of the byte below arrives through the carry flag.
; On exit the carry holds the bit shifted out of the highest byte.
.macro shl bytes, arg
    asl arg                     ; byte 0: bit 7 -> carry, 0 shifted into bit 0
    .repeat bytes-1, i
        rol arg + 1 + i         ; bytes 1..bytes-1: carry in, bit 7 -> carry
; Shift the 2 bytes starting at arg left by one bit (via the generic shl macro).
.macro shl16 arg
shl 2, arg
; Shift the 3 bytes starting at arg left by one bit (via the generic shl macro).
.macro shl24 arg
shl 3, arg
; Shift the 4 bytes starting at arg left by one bit (via the generic shl macro).
.macro shl32 arg
shl 4, arg
; Copy `bytes` bytes from arg to dest through the accumulator, lowest address first.
; Cost: 6 * bytes cycles
.macro copy bytes, dest, arg
    .repeat bytes, offset       ; unrolled, 6 cyc per byte
        lda arg + offset        ; 3 cyc
        sta dest + offset       ; 3 cyc
; Copy 2 bytes from arg to dest (via the generic copy macro).
.macro copy16 dest, arg
copy 2, dest, arg
; Copy 4 bytes from arg to dest (via the generic copy macro).
.macro copy32 dest, arg
copy 4, dest, arg
; 2 + 8 * byte cycles
; 2 + 8 * byte cycles
.macro neg bytes, arg
.macro neg bytes, arg
sec ; 2 cyc
sec ; 2 cyc
@ -92,9 +177,17 @@ next:
; 16-bit x 16-bit multiply with a 32-bit result: dest = arg1 * arg2.
; Loads the operands into the FR0/FR1 zero-page registers, calls imul16_func,
; then copies the 4-byte result out of FR2 into dest.
; FR0 and FR1 are overwritten with the operands before the call.
; Cost: 12 + 12 + (470..780) + 24 = 518 - 828 cyc
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc - first operand into FR0
copy16 FR1, arg2 ; 12 cyc - second operand into FR1
jsr imul16_func ; 470-780 cyc - leaves the product in FR2
copy32 dest, FR2 ; 24 cyc - store the 32-bit product
; min 470 cycles
; min 470 cycles
; max 780 cycles
; max 780 cycles
.proc imul16
.proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered)
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
result = FR2 ; 32-bit result
@ -128,7 +221,7 @@ positive_result:
rts ; 6 cyc
rts ; 6 cyc
.macro round16_incdec arg
.macro round16 arg
; Round top 16 bits of 32-bit fixed-point number in-place
; Round top 16 bits of 32-bit fixed-point number in-place
.local zero
.local zero
.local one
.local one
@ -178,61 +271,113 @@ next:
.proc iter
.proc mandelbrot
; still working on the fixed-point
; input:
; should we just use 16-bit adds?
; cx: position scaled to 4.12 fixed point - -8..+7.9
; does that require extra rounding?
; cy: position scaled to 4.12
; is the integer precision right?
; output:
; iter: iteration count at escape or 0
; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)
; zx = 0
; zx = 0
; zy = 0
; zy = 0
; zx_2 = 0
; zx_2 = 0
; zy_2 = 0
; zy_2 = 0
; zx_zy = 0
; zx_zy = 0
; dist = 0
; iter = 0
; Zero the working variables laid out between zx and iter in zero page.
lda #00
ldx #(iter - zx) ; immediate count: without '#' this read X from zero-page address (iter - zx)
; NOTE(review): the initloop label and the X decrement are not visible in this
; view - confirm that the loop also clears offset 0 (the low byte of zx), since
; a count-down loop that exits on X == 0 never stores at zx + 0.
sta zx,x
bne initloop
; 1652 - 2651 cyc
; 1939 - 3007 cyc
; iters++ = 2 cyc
; iter++ & max-iters break = 7 cyc
inc iter ; 5 cyc
bne keep_going ; 2 cyc
; 4.12: (-8 .. +7.9)
; 4.12: (-8 .. +7.9)
; zx = zx_2 + zy_2 + cx = 3 * 20 = 60 cyc
; zx = zx_2 - zy_2 + cx = 3 * 20 = 60 cyc
sub16 zx, zx_2, zy_2
add16 zx, zx, cx
; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc
; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc
add16 zy, zx_zy, zx_zy ; 2 * zx * zy - must be an add; subtracting zx_zy from itself always gave 0
add16 zy, zy, cy
; 8.24: (-128 .. +127.9)
; 8.24: (-128 .. +127.9)
; zx_2 = zx * zx = 470 - 780 cyc
; zx_2 = zx * zx = 518 - 828 cyc
; zy_2 = zy * zy = 470 - 780 cyc
imul16 zx_2, zx, zx
; zx_zy = zx * zy = 470 - 780 cyc
; dist = zx_2 + zy_2 = 38 cyc
; if dist >= 4 break, else continue iterating = 7 cyc
; shift and round zx_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
; zy_2 = zy * zy = 518 - 828 cyc
; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
imul16 zy_2, zy, zy
; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
; zx_zy = zx * zy = 518 - 828 cyc
imul16 zx_zy, zx, zy
; dist = zx_2 + zy_2 = 38 cyc
add32 dist, zx_2, zy_2
; if dist >= 4 break, else continue iterating = 7 cyc
; dist + 3 is the top byte of the 32-bit sum (the integer part in 8.24).
lda dist + 3 ; 3 cyc
cmp #4 ; 2 cyc
bcc still_in ; 2 cyc - unsigned "A < 4"; bmi would also branch for top bytes >= $84
; shift and round zx_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
.repeat 4 ; 60 cyc
shl24 zx_2 ; 15 cyc
round16 zx_2 ; 5-28 cycles
; shift and round zy_2 to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
.repeat 4 ; 60 cyc
shl24 zy_2 ; 15 cyc
round16 zy_2 ; 5-28 cycles
; shift and round zx_zy to 4.12 = (60 + 5) - (60 + 28) = 65 - 88 cyc
.repeat 4 ; 60 cyc
shl24 zx_zy ; 15 cyc
round16 zx_zy ; 5-28 cycles
; if may be in the lake, look for looping output with a small buffer
; if may be in the lake, look for looping output with a small buffer
; as an optimization vs running to max iters
; as an optimization vs running to max iters
jmp loop ; 3 cycles
.proc start
.proc start
; FR0 = 5
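; cx = -0.5 (in 4.12 fixed point that is $f800; NOTE(review): the stores to cx/cx+1 below write $fff7 - confirm)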
; FR1 = -3
lda #$f7
lda #5
sta cx
sta FR0
lda #0
sta FR0 + 1
lda #$fd
sta FR1
lda #$ff
lda #$ff
sta FR1 + 1
sta cx + 1
jsr imul16
; cy = 1 (4.12 fixed point: 1.0 = $1000, low byte stored first)
lda #$00
sta cy ; low byte
lda #$10
sta cy + 1 ; high byte - previously the bytes were swapped, giving $0010
jsr mandelbrot
; should have 32-bit -15 in FR2
; should have 32-bit -15 in FR2
; save the completed iter count for debugging
lda iter
sta temp
jmp loop
; keep looping over so we can work in the debugger
jmp looplong
Reference in a new issue