diff --git a/mandel.s b/mandel.s index 665ea70..b2161de 100644 --- a/mandel.s +++ b/mandel.s @@ -176,80 +176,38 @@ next: .endmacro -.macro round16_addsub arg - ; Round top 16 bits of 32-bit fixed-point number in-place - .local zero - .local one - .local positive - .local negative - .local neg2 - .local next - - ; no round - 5 cycles - ; one, pos - 28 cycles - ; one, neg - 31 cycles - ; average = 5 / 2 + (28 + 31) / 4 - ; = 5/2 + 59 / 4 - ; = 2.5 + 14.75 - ; = 17.25 cycles average on evenly distributed data - - lda arg + 1 ; 3 cyc - bpl zero ; 2 cyc - -one: - ; check sign bit - lda arg + 3 ; 3 cyc - bpl positive ; 2 cyc - -negative: - sec ; 2 cyc - lda arg + 2 ; 3 cyc - sbc #1 ; 2 cyc - sta arg + 2 ; 3 cyc - lda arg + 3 ; 3 cyc - sbc #0 ; 2 cyc - lda arg + 3 ; 3 cyc - jmp next ; 3 cyc - -positive: - clc ; 2 cyc - lda arg + 2 ; 3 cyc - adc #1 ; 2 cyc - sta arg + 2 ; 3 cyc - lda arg + 3 ; 3 cyc - adc #0 ; 2 cyc - sta arg + 3 ; 3 cyc - -zero: -next: - -.endmacro .proc iter - ; (cx and cy should be pre-scaled to 6.26 fixed point) + ; still working on the fixed-point + ; should we just use 16-bit adds? + ; does that require extra rounding? + ; is the integer precision right? + + ; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9) ; zx = 0 ; zy = 0 ; zx_2 = 0 ; zy_2 = 0 ; zx_zy = 0 - ; still working on the fixed-point loop: - ; iters++ + ; 1644.5 - 2574.5 cyc - ; 6.26: - ; zx = zx_2 + zy_2 + cx - ; zy = zx_zy + zx_zy + cy - ; round to 6.10. + ; iters++ = 2 cyc - ; 12.20: - ; zx_2 = zx * zx - ; zy_2 = zy * zy - ; dist = zx_2 + zy_2 - ; if dist >= 4 break, else continue iterating + ; 4.12: (-8 .. 
+7.9) + ; zx = zx_2 - zy_2 + cx = 3 * 20 = 60 cyc + ; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc - ; round zx_2, zy_2, dist to 6.26 + ; 8.24: + ; zx_2 = zx * zx = 470 - 780 cyc + ; zy_2 = zy * zy = 470 - 780 cyc + ; zx_zy = zx * zy = 470 - 780 cyc + ; dist = zx_2 + zy_2 = 38 cyc + ; if dist >= 4 break, else continue iterating = 7 cyc + + ; shift and round zx_2, zy_2, dist up to 4.12 = 2 * (20 + 13.75) = 67.5 cycles ; if may be in the lake, look for looping output with a small buffer ; as an optimization vs running to max iters