Compare commits

...

2 commits

Author SHA1 Message Date
8d312ea19d update round & sketch out the iter cycle count 2023-01-05 11:33:06 -08:00
7f70f14fc2 one last round sketch
combining copy and round
2023-01-05 11:17:13 -08:00

View file

@ -176,80 +176,38 @@ next:
.endmacro
.macro round16_addsub arg
    ; Round the top 16 bits of a 32-bit fixed-point number in place.
    ;
    ; In:    arg  = address of a 4-byte little-endian value
    ;               (arg+0 lowest byte .. arg+3 highest byte)
    ; Out:   arg+2..arg+3 adjusted by one unit when bit 7 of arg+1 (the top
    ;        bit of the discarded low half) is set: +1 when arg+3 is
    ;        non-negative, -1 when arg+3 is negative.
    ;        arg+0..arg+1 are left unchanged.
    ; Clobb: A, flags (N, Z, C, V)
    ;
    ; NOTE(review): for two's-complement values the nearest result is obtained
    ; by adding 1 regardless of sign; subtracting on negative inputs moves the
    ; result away from the nearest value. Confirm the intended rounding mode.
    .local zero
    .local one
    .local positive
    .local negative
    .local next

    ; Cycle estimates (as annotated per instruction below; they assume
    ; zero-page operands):
    ;   no round - 5 cycles
    ;   one, pos - 28 cycles
    ;   one, neg - 31 cycles
    ;   average = 5 / 2 + (28 + 31) / 4
    ;           = 5/2 + 59 / 4
    ;           = 2.5 + 14.75
    ;           = 17.25 cycles average on evenly distributed data
    ; NOTE(review): a taken branch costs 3 cycles on the 6502, so paths that
    ; take `bpl` are 1 cycle higher than listed.

    lda arg + 1         ; 3 cyc - N flag = rounding bit
    bpl zero            ; 2 cyc - rounding bit clear: nothing to do

one:
    ; Rounding bit is set; pick direction from the sign bit.
    lda arg + 3         ; 3 cyc
    bpl positive        ; 2 cyc

negative:
    ; 16-bit decrement of arg+2..arg+3.
    sec                 ; 2 cyc - no borrow going in
    lda arg + 2         ; 3 cyc
    sbc #1              ; 2 cyc
    sta arg + 2         ; 3 cyc
    lda arg + 3         ; 3 cyc
    sbc #0              ; 2 cyc - propagate borrow into the high byte
    sta arg + 3         ; 3 cyc - store it (was `lda`, which dropped the borrow)
    jmp next            ; 3 cyc

positive:
    ; 16-bit increment of arg+2..arg+3.
    clc                 ; 2 cyc - no carry going in
    lda arg + 2         ; 3 cyc
    adc #1              ; 2 cyc
    sta arg + 2         ; 3 cyc
    lda arg + 3         ; 3 cyc
    adc #0              ; 2 cyc - propagate carry into the high byte
    sta arg + 3         ; 3 cyc

zero:
next:
.endmacro
.proc iter
; (cx and cy should be pre-scaled to 6.26 fixed point)
; still working on the fixed-point
; should we just use 16-bit adds?
; does that require extra rounding?
; is the integer precision right?
; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)
; zx = 0
; zy = 0
; zx_2 = 0
; zy_2 = 0
; zx_zy = 0
; still working on the fixed-point
loop:
; iters++
; 1644.5 - 2264.5 cyc
; 6.26:
; zx = zx_2 + zy_2 + cx
; zy = zx_zy + zx_zy + cy
; round to 6.10.
; iters++ = 2 cyc
; 12.20:
; zx_2 = zx * zx
; zy_2 = zy * zy
; dist = zx_2 + zy_2
; if dist >= 4 break, else continue iterating
; 4.12: (-8 .. +7.9)
; zx = zx_2 + zy_2 + cx = 3 * 20 = 60 cyc
; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc
; round zx_2, zy_2, dist to 6.26
; 8.24:
; zx_2 = zx * zx = 470 - 780 cyc
; zy_2 = zy * zy = 470 - 780 cyc
; zx_zy = zx * zy = 470 - 780 cyc
; dist = zx_2 + zy_2 = 38 cyc
; if dist >= 4 break, else continue iterating = 7 cyc
; shift and round zx_2, zy_2, dist up to 4.12 = 2 * (20 + 13.75) = 67.5 cycles
; if may be in the lake, look for looping output with a small buffer
; as an optimization vs running to max iters