diff --git a/mandel.s b/mandel.s
index b2161de..665ea70 100644
--- a/mandel.s
+++ b/mandel.s
@@ -176,38 +176,80 @@ next:
 
 .endmacro
 
+.macro round16_addsub arg
+	; Round top 16 bits of 32-bit fixed-point number in-place
+	; arg+0/arg+1 hold the low 16 bits, arg+2/arg+3 the top 16 bits (little-endian)
+	; clobbers A and the processor flags; only arg+2 and arg+3 are written
+	.local zero
+	.local positive
+	.local negative
+	.local next
+
+	; no round - 6 cycles (a taken branch costs 3 cycles, not 2)
+	; one, pos - 29 cycles
+	; one, neg - 31 cycles
+	; average = 6 / 2 + (29 + 31) / 4
+	;         = 6/2 + 60 / 4
+	;         = 3 + 15
+	;         = 18 cycles average on evenly distributed data
+
+	lda arg + 1	; 3 cyc - bit 7 of this byte is the rounding bit
+	bpl zero	; 2 cyc (3 if taken) - bit clear, nothing to do
+
+	; rounding bit is set: pick the direction from the sign bit
+	; NOTE(review): two's-complement round-to-nearest adds 1 for either sign - confirm the subtract
+	lda arg + 3	; 3 cyc
+	bpl positive	; 2 cyc (3 if taken)
+
+negative:
+	sec		; 2 cyc
+	lda arg + 2	; 3 cyc
+	sbc #1		; 2 cyc
+	sta arg + 2	; 3 cyc
+	lda arg + 3	; 3 cyc
+	sbc #0		; 2 cyc - propagate the borrow
+	sta arg + 3	; 3 cyc - write the borrowed high byte back
+	jmp next	; 3 cyc
+
+positive:
+	clc		; 2 cyc
+	lda arg + 2	; 3 cyc
+	adc #1		; 2 cyc
+	sta arg + 2	; 3 cyc
+	lda arg + 3	; 3 cyc
+	adc #0		; 2 cyc - propagate the carry
+	sta arg + 3	; 3 cyc
+
+zero:
+next:
+
+.endmacro
 
 .proc iter
-	; still working on the fixed-point
-	; should we just use 16-bit adds?
-	; does that require extra rounding?
-	; is the integer precision right?
-
-	; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)
+	; (cx and cy should be pre-scaled to 6.26 fixed point)
 
 	; zx = 0
 	; zy = 0
 	; zx_2 = 0
 	; zy_2 = 0
 	; zx_zy = 0
 
+	; still working on the fixed-point
 loop:
-	; 1644.5 - 2264.5 cyc
+	; iters++
 
-	; iters++ = 2 cyc
+	; 6.26:
+	; zx = zx_2 + zy_2 + cx
+	; zy = zx_zy + zx_zy + cy
+	; round to 6.10.
 
-	; 4.12: (-8 .. +7.9)
-	; zx = zx_2 + zy_2 + cx = 3 * 20 = 60 cyc
-	; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc
+	; 12.20:
+	; zx_2 = zx * zx
+	; zy_2 = zy * zy
+	; dist = zx_2 + zy_2
+	; if dist >= 4 break, else continue iterating
 
-	; 8.24:
-	; zx_2 = zx * zx = 470 - 780 cyc
-	; zy_2 = zy * zy = 470 - 780 cyc
-	; zx_zy = zx * zy = 470 - 780 cyc
-	; dist = zx_2 + zy_2 = 38 cyc
-	; if dist >= 4 break, else continue iterating = 7 cyc
-
-	; shift and round zx_2, zy_2, dist up to 4.12 = 2 * (20 + 13.75) = 67.5 cycles
+	; round zx_2, zy_2, dist to 6.26
 
 	; if may be in the lake, look for looping output with a small buffer
 	; as an optimization vs running to max iters