Added two versions of 16-bit rounding

round16_incdec uses inc and dec
round16_addsub uses adc and sbc

The incdec version costs the same when no rounding is needed
but saves about 8 cycles on the rounding cases, for an
average saving of about 3.5 cycles on randomly distributed inputs

untested so far
This commit is contained in:
Brooke Vibber 2023-01-05 09:06:07 -08:00
parent 6edd2f71d9
commit 7ac0df0343

View file

@ -128,6 +128,104 @@ positive_result:
rts ; 6 cyc
.endproc
.macro round16_incdec arg
    ; Round the top 16 bits (arg + 2 .. arg + 3) of the little-endian
    ; 32-bit fixed-point number at arg to the nearest value, in place.
    ;
    ; In:    arg = address of the 32-bit value (arg + 0 is the low byte)
    ; Out:   arg + 2 / arg + 3 hold the rounded high word;
    ;        arg + 0 / arg + 1 are left untouched
    ; Clobb: A, N and Z flags
    ;
    ; Bit 7 of arg + 1 is the top bit of the discarded low word, i.e. the
    ; "one half" bit. Truncating a two's-complement value always rounds
    ; toward minus infinity, so whenever that bit is set the nearest 16-bit
    ; value is the truncated one plus 1 -- for negative values just as for
    ; positive ones. No sign test is needed: decrementing a negative value
    ; here would move it one further whole step away from the true value.
    ; An exact tie (low word = $8000) rounds up.
    ;
    ; NOTE(review): assumes arg holds a two's-complement value (sign in
    ; bit 7 of arg + 3) -- confirm against the multiply that produces it.
    ;
    ; Cycle counts below assume arg is in zero page and that no branch
    ; crosses a page boundary:
    ;   no round        -  6 cycles
    ;   round, no carry - 13 cycles
    ;   round, carry    - 17 cycles
    .local done

    lda arg + 1         ; 3 cyc  N flag = half bit of the low word
    bpl done            ; 2 cyc (+1 if taken)  below one half: keep as is

    inc arg + 2         ; 5 cyc  add 1 to the low byte of the high word
    bne done            ; 2 cyc (+1 if taken)  non-zero result: no wrap
    inc arg + 3         ; 5 cyc  low byte wrapped $FF -> $00: carry up
done:
.endmacro
.macro round16_addsub arg
    ; Round the top 16 bits (arg + 2 .. arg + 3) of the little-endian
    ; 32-bit fixed-point number at arg to the nearest value, in place,
    ; using a 16-bit add with carry. (The macro name is kept for existing
    ; callers; no subtraction is required, see below.)
    ;
    ; In:    arg = address of the 32-bit value (arg + 0 is the low byte)
    ; Out:   arg + 2 / arg + 3 hold the rounded high word;
    ;        arg + 0 / arg + 1 are left untouched
    ; Clobb: A, and the C, N, V, Z flags
    ;
    ; Bit 7 of arg + 1 is the top bit of the discarded low word, i.e. the
    ; "one half" bit. Truncating a two's-complement value always rounds
    ; toward minus infinity, so whenever that bit is set the nearest 16-bit
    ; value is the truncated one plus 1 -- for negative values just as for
    ; positive ones. Subtracting 1 from a negative value would move it one
    ; further whole step away from the true value, so there is no separate
    ; negative path. An exact tie (low word = $8000) rounds up.
    ;
    ; NOTE(review): assumes arg holds a two's-complement value (sign in
    ; bit 7 of arg + 3) and that decimal mode is off so adc is binary --
    ; confirm both against the callers.
    ;
    ; Cycle counts below assume arg is in zero page and that the branch
    ; does not cross a page boundary:
    ;   no round -  6 cycles
    ;   round    - 23 cycles
    .local done

    lda arg + 1         ; 3 cyc  N flag = half bit of the low word
    bpl done            ; 2 cyc (+1 if taken)  below one half: keep as is

    clc                 ; 2 cyc  start the add with no carry in
    lda arg + 2         ; 3 cyc
    adc #1              ; 2 cyc  low byte of high word + 1
    sta arg + 2         ; 3 cyc
    lda arg + 3         ; 3 cyc
    adc #0              ; 2 cyc  propagate the carry into the high byte
    sta arg + 3         ; 3 cyc  store it (a load here would drop the result)
done:
.endmacro
.proc iter
; (cx and cy should be pre-scaled to 6.26 fixed point)
; zx = 0