From 2340f8210ea8e54b9fafd25b5e164a524b9705d4 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Thu, 5 Jan 2023 11:32:15 -0800 Subject: [PATCH] update round & sketch out the iter cycle count --- mandel.s | 137 ++++++++----------------------------------------------- 1 file changed, 19 insertions(+), 118 deletions(-) diff --git a/mandel.s b/mandel.s index 7701090..b69907c 100644 --- a/mandel.s +++ b/mandel.s @@ -176,137 +176,38 @@ next: .endmacro -.macro round16_addsub arg - ; Round top 16 bits of 32-bit fixed-point number in-place - .local zero - .local one - .local positive - .local negative - .local neg2 - .local next - - ; no round - 5 cycles - ; one, pos - 28 cycles - ; one, neg - 31 cycles - ; average = 5 / 2 + (28 + 31) / 4 - ; = 5/2 + 59 / 4 - ; = 2.5 + 14.75 - ; = 17.25 cycles average on evenly distributed data - - lda arg + 1 ; 3 cyc - bpl zero ; 2 cyc - -one: - ; check sign bit - lda arg + 3 ; 3 cyc - bpl positive ; 2 cyc - -negative: - sec ; 2 cyc - lda arg + 2 ; 3 cyc - sbc #1 ; 2 cyc - sta arg + 2 ; 3 cyc - lda arg + 3 ; 3 cyc - sbc #0 ; 2 cyc - sta arg + 3 ; 3 cyc - jmp next ; 3 cyc - -positive: - clc ; 2 cyc - lda arg + 2 ; 3 cyc - adc #1 ; 2 cyc - sta arg + 2 ; 3 cyc - lda arg + 3 ; 3 cyc - adc #0 ; 2 cyc - sta arg + 3 ; 3 cyc - -zero: -next: - -.endmacro - -.macro round16_addsub_copy arg, dest - ; Round top 16 bits of 32-bit fixed-point number and copy it - .local zero - .local one - .local positive - .local negative - .local neg2 - .local next - - ; no round - 17 cycles - ; round, positive - 31 cycles - ; round, negative - 31 cycles - ; average = 17 / 2 + (31 + 31) / 4 - ; = 17 / 2 + 62 / 4 - ; = 24 cycles average - ; - ; compare with 13.75 cyc in-place plus three copies at 12 cycles - ; 13.75 + 36 = 49.75 (41 - 64) - ; versus three rounds+copies: 72 (51 - 93) - - lda arg + 1 ; 3 cyc - bpl zero ; 2 cyc - -one: - ; check sign bit - lda arg + 3 ; 3 cyc - bpl positive ; 2 cyc - -negative: - sec ; 2 cyc - lda arg + 2 ; 3 cyc - sbc #1 ; 2 cyc - sta dest 
; 3 cyc - lda arg + 3 ; 3 cyc - sbc #0 ; 2 cyc - jmp next ; 3 cyc - -positive: - clc ; 2 cyc - lda arg + 2 ; 3 cyc - adc #1 ; 2 cyc - sta dest ; 3 cyc - lda arg + 3 ; 3 cyc - adc #0 ; 2 cyc - jmp next ; 3 cyc - -zero: - lda arg + 2 ; 3 cyc - sta dest ; 3 cyc - lda arg + 3 ; 3 cyc - -next: - sta dest + 2 ; 3 cyc - - -.endmacro .proc iter - ; (cx and cy should be pre-scaled to 6.26 fixed point) + ; still working on the fixed-point + ; should we just use 16-bit adds? + ; does that require extra rounding? + ; is the integer precision right? + + ; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9) ; zx = 0 ; zy = 0 ; zx_2 = 0 ; zy_2 = 0 ; zx_zy = 0 - ; still working on the fixed-point loop: - ; iters++ + ; 1627 - 2603 cyc - ; 6.26: - ; zx = zx_2 + zy_2 + cx - ; zy = zx_zy + zx_zy + cy - ; round to 6.10. + ; iters++ = 2 cyc - ; 12.20: - ; zx_2 = zx * zx - ; zy_2 = zy * zy - ; dist = zx_2 + zy_2 - ; if dist >= 4 break, else continue iterating + ; 4.12: (-8 .. +7.9) + ; zx = zx_2 - zy_2 + cx = 3 * 20 = 60 cyc + ; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc - ; round zx_2, zy_2, dist to 6.26 + ; 8.24: + ; zx_2 = zx * zx = 470 - 780 cyc + ; zy_2 = zy * zy = 470 - 780 cyc + ; zx_zy = zx * zy = 470 - 780 cyc + ; dist = zx_2 + zy_2 = 38 cyc + ; if dist >= 4 break, else continue iterating = 7 cyc + + ; shift and round zx_2, zy_2, dist up to 4.12 = 2 * (20 + 5) - 2 * (20 + 28) = 50 - 96 cyc ; if may be in the lake, look for looping output with a small buffer ; as an optimization vs running to max iters