From 7ac0df034356ef0b186f56591b8eff4737adcc44 Mon Sep 17 00:00:00 2001
From: Brion Vibber
Date: Thu, 5 Jan 2023 09:06:07 -0800
Subject: [PATCH] Added two version of 16-bit rounding

round16_incdec uses inc and dec
round16_addsub uses adc and sbc

the incdec version is the same when no rounding is needed
but saves about 8 cycles on the rounding cases, for an average
savings of 4.5 cycles for randomly distributed inputs

untested so far
---
 mandel.s | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/mandel.s b/mandel.s
index 2b55f78..665ea70 100644
--- a/mandel.s
+++ b/mandel.s
@@ -128,6 +128,128 @@ positive_result:
     rts ; 6 cyc
 .endproc
 
+.macro round16_incdec arg
+    ; Round the top 16 bits of a 32-bit fixed-point number in place,
+    ; using inc/dec on memory.
+    ;
+    ; arg+0..arg+1 hold the low 16 bits (arg+1 is their high byte) and
+    ; arg+2..arg+3 hold the top 16 bits (arg+3 carries the sign bit).
+    ; If bit 7 of arg+1 is clear, nothing is changed. Otherwise the top
+    ; 16 bits are incremented when arg+3 is non-negative and decremented
+    ; when it is negative. arg+0 and arg+1 are never written.
+    ; Clobbers A and the N/Z flags.
+    ;
+    ; NOTE(review): for a two's-complement value the low 16 bits are
+    ; always a non-negative fraction, so round-to-nearest would increment
+    ; regardless of sign -- confirm that decrementing is intended.
+    .local zero
+    .local one
+    .local positive
+    .local negative
+    .local neg2
+    .local next
+
+    ; Cycle totals sum the per-instruction annotations below, which
+    ; count every branch as 2 cycles:
+    ; no round - 5 cycles
+    ; round pos, no carry - 17
+    ; round pos, carry - 22
+    ; round neg, no carry - 23
+    ; round neg, carry - 28
+    ; average = 5 / 2 + (17 + 22 + 23 + 28) / 8
+    ;         = 5 / 2 + 90 / 8
+    ;         = 2.5 + 11.25 = 13.75 cycles average on evenly distributed input
+
+    lda arg + 1         ; 3 cyc - N = top bit of the discarded low half
+    bpl zero            ; 2 cyc - bit clear: no rounding needed
+
+one:
+    ; check sign bit
+    lda arg + 3         ; 3 cyc
+    bpl positive        ; 2 cyc
+
+negative:
+    lda arg + 2         ; 3 cyc
+    beq neg2            ; 2 cyc - low byte is 0, so dec will borrow
+
+    dec arg + 2         ; 5 cyc
+    jmp next            ; 3 cyc
+
+neg2:
+    dec arg + 2         ; 5 cyc - $00 -> $ff
+    dec arg + 3         ; 5 cyc - propagate the borrow
+    jmp next            ; 3 cyc
+
+positive:
+    inc arg + 2         ; 5 cyc
+    bne next            ; 2 cyc - nonzero result: no carry out
+    inc arg + 3         ; 5 cyc - low byte wrapped to 0: propagate carry
+
+zero:
+next:
+
+.endmacro
+
+.macro round16_addsub arg
+    ; Round the top 16 bits of a 32-bit fixed-point number in place,
+    ; using adc/sbc through the accumulator.
+    ;
+    ; Byte layout and rounding rule match round16_incdec: bit 7 of arg+1
+    ; selects rounding, bit 7 of arg+3 selects add (non-negative) or
+    ; subtract (negative) of 1 on the 16-bit value at arg+2..arg+3.
+    ; Clobbers A and the N/Z flags; C and V too when rounding occurs.
+    ;
+    ; NOTE(review): same question as round16_incdec about decrementing
+    ; negative two's-complement values -- confirm the intended mode.
+    .local zero
+    .local one
+    .local positive
+    .local negative
+    .local neg2
+    .local next
+
+    ; Cycle totals (every branch counted as 2 cycles):
+    ; no round - 5 cycles
+    ; one, pos - 28 cycles
+    ; one, neg - 31 cycles
+    ; average = 5 / 2 + (28 + 31) / 4
+    ;         = 5/2 + 59 / 4
+    ;         = 2.5 + 14.75
+    ;         = 17.25 cycles average on evenly distributed data
+
+    lda arg + 1         ; 3 cyc - N = top bit of the discarded low half
+    bpl zero            ; 2 cyc - bit clear: no rounding needed
+
+one:
+    ; check sign bit
+    lda arg + 3         ; 3 cyc
+    bpl positive        ; 2 cyc
+
+negative:
+    sec                 ; 2 cyc - no borrow going into the low byte
+    lda arg + 2         ; 3 cyc
+    sbc #1              ; 2 cyc
+    sta arg + 2         ; 3 cyc
+    lda arg + 3         ; 3 cyc
+    sbc #0              ; 2 cyc - subtract only the borrow from the low byte
+    sta arg + 3         ; 3 cyc - store the high byte back
+    jmp next            ; 3 cyc
+
+positive:
+    clc                 ; 2 cyc - no carry going into the low byte
+    lda arg + 2         ; 3 cyc
+    adc #1              ; 2 cyc
+    sta arg + 2         ; 3 cyc
+    lda arg + 3         ; 3 cyc
+    adc #0              ; 2 cyc - add only the carry from the low byte
+    sta arg + 3         ; 3 cyc
+
+zero:
+next:
+
+.endmacro
+
+
 .proc iter
     ; (cx and cy should be pre-scaled to 6.26 fixed point)
     ; zx = 0