update round & sketch out the iter cycle count
This commit is contained in:
parent
7f70f14fc2
commit
2340f8210e
1 changed files with 19 additions and 118 deletions
137
mandel.s
137
mandel.s
|
@ -176,137 +176,38 @@ next:
|
||||||
|
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
.macro round16_addsub arg
    ; Round the top 16 bits (arg+2 .. arg+3) of the 32-bit little-endian
    ; two's-complement fixed-point value at `arg` to nearest, in place.
    ; Ties (low word exactly $8000) round away from zero.
    ;
    ; The low word (arg .. arg+1) is left untouched. Clobbers A and N/Z.
    ;
    ; In two's complement the top word on its own is floor(value) for
    ; either sign, so the only correction ever needed is +1. The earlier
    ; version subtracted 1 from negative values, which moved them a full
    ; step further from the true value (e.g. -1.5 = $FFFE8000 became -3).
    ;
    ; No overflow check: a top word of $7FFF rounds up to $8000.
    ;
    ; Cycle counts assume zero-page operands; a taken branch costs 3.
    .local round_up
    .local done

    lda arg + 1         ; 3 cyc - bit 7 is the 1/2 bit of the discarded fraction
    bpl done            ; 2 cyc - fraction < 1/2: floor is already nearest

    lda arg + 3         ; 3 cyc - sign of the whole value
    bpl round_up        ; 2 cyc - positive: fraction >= 1/2 always rounds up

    ; negative: an exact tie stays at floor, which is the away-from-zero side
    lda arg + 1         ; 3 cyc
    and #$7f            ; 2 cyc - drop the 1/2 bit
    ora arg             ; 3 cyc - any remaining fraction bits set?
    beq done            ; 2 cyc - exact tie: keep floor

round_up:
    inc arg + 2         ; 5 cyc
    bne done            ; 2 cyc - no carry out of the low byte
    inc arg + 3         ; 5 cyc

done:
.endmacro
|
|
||||||
|
|
||||||
.macro round16_addsub_copy arg, dest
    ; Round the top 16 bits (arg+2 .. arg+3) of the 32-bit little-endian
    ; two's-complement fixed-point value at `arg` to nearest and store the
    ; 16-bit result at dest .. dest+1. Ties (low word exactly $8000) round
    ; away from zero. `arg` itself is not modified.
    ;
    ; Clobbers A and the flags.
    ;
    ; In two's complement the top word on its own is floor(value) for
    ; either sign, so the only correction ever needed is +1. The earlier
    ; version subtracted 1 from negative values, which moved them a full
    ; step further from the true value (e.g. -1.5 = $FFFE8000 became -3).
    ;
    ; The earlier version also stored the high byte to dest + 2, leaving
    ; dest + 1 unwritten; the result is now stored contiguously.
    ; NOTE(review): confirm no caller relied on the dest + 2 layout.
    ;
    ; No overflow check: a top word of $7FFF rounds up to $8000.
    ;
    ; Cycle counts assume zero-page operands; a taken branch costs 3.
    .local round_up
    .local truncate
    .local store_high

    lda arg + 1         ; 3 cyc - bit 7 is the 1/2 bit of the discarded fraction
    bpl truncate        ; 2 cyc - fraction < 1/2: floor is already nearest

    lda arg + 3         ; 3 cyc - sign of the whole value
    bpl round_up        ; 2 cyc - positive: fraction >= 1/2 always rounds up

    ; negative: an exact tie stays at floor, which is the away-from-zero side
    lda arg + 1         ; 3 cyc
    and #$7f            ; 2 cyc - drop the 1/2 bit
    ora arg             ; 3 cyc - any remaining fraction bits set?
    beq truncate        ; 2 cyc - exact tie: keep floor

round_up:
    clc                 ; 2 cyc
    lda arg + 2         ; 3 cyc
    adc #1              ; 2 cyc
    sta dest            ; 3 cyc
    lda arg + 3         ; 3 cyc
    adc #0              ; 2 cyc - propagate carry from the low byte
    jmp store_high      ; 3 cyc

truncate:
    lda arg + 2         ; 3 cyc
    sta dest            ; 3 cyc
    lda arg + 3         ; 3 cyc

store_high:
    sta dest + 1        ; 3 cyc

.endmacro
|
|
||||||
|
|
||||||
|
|
||||||
.proc iter
|
.proc iter
|
||||||
; (cx and cy should be pre-scaled to 6.26 fixed point)
|
; still working on the fixed-point
|
||||||
|
; should we just use 16-bit adds?
|
||||||
|
; does that require extra rounding?
|
||||||
|
; is the integer precision right?
|
||||||
|
|
||||||
|
; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)
|
||||||
; zx = 0
|
; zx = 0
|
||||||
; zy = 0
|
; zy = 0
|
||||||
; zx_2 = 0
|
; zx_2 = 0
|
||||||
; zy_2 = 0
|
; zy_2 = 0
|
||||||
; zx_zy = 0
|
; zx_zy = 0
|
||||||
|
|
||||||
; still working on the fixed-point
|
|
||||||
loop:
|
loop:
|
||||||
; iters++
|
; 1627 - 2603 cyc
|
||||||
|
|
||||||
; 6.26:
|
; iters++ = 2 cyc
|
||||||
; zx = zx_2 + zy_2 + cx
|
|
||||||
; zy = zx_zy + zx_zy + cy
|
|
||||||
; round to 6.10.
|
|
||||||
|
|
||||||
; 12.20:
|
; 4.12: (-8 .. +7.9)
|
||||||
; zx_2 = zx * zx
|
; zx = zx_2 + zy_2 + cx = 3 * 20 = 60 cyc
|
||||||
; zy_2 = zy * zy
|
; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc
|
||||||
; dist = zx_2 + zy_2
|
|
||||||
; if dist >= 4 break, else continue iterating
|
|
||||||
|
|
||||||
; round zx_2, zy_2, dist to 6.26
|
; 8.24:
|
||||||
|
; zx_2 = zx * zx = 470 - 780 cyc
|
||||||
|
; zy_2 = zy * zy = 470 - 780 cyc
|
||||||
|
; zx_zy = zx * zy = 470 - 780 cyc
|
||||||
|
; dist = zx_2 + zy_2 = 38 cyc
|
||||||
|
; if dist >= 4 break, else continue iterating = 7 cyc
|
||||||
|
|
||||||
|
; shift and round zx_2, zy_2, dist up to 4.12 = 2 * (20 + 5) - 2 * (20 + 28) = 50 - 96 cyc
|
||||||
|
|
||||||
; if may be in the lake, look for looping output with a small buffer
|
; if may be in the lake, look for looping output with a small buffer
|
||||||
; as an optimization vs running to max iters
|
; as an optimization vs running to max iters
|
||||||
|
|
Loading…
Reference in a new issue