slightly faster handling of signed mul

previously we were flipping the inputs if negative, and then the
output if both inputs were negative

turns out you can just treat the whole thing as an unsigned mul
and then subtract each term from the high word if the other term
is negative.

https://stackoverflow.com/a/28827013

this saves a handful of cycles, reducing our runtime to a merge
14.211 ms/px \o/
This commit is contained in:
Brooke Vibber 2024-12-15 20:17:45 -08:00
parent 7f2bc43cff
commit 05133aabdd

View file

@ -344,18 +344,6 @@ fill_masks:
neg 4, arg
.endmacro
; 5 to 25 cycles
.macro check_sign arg
; Check sign bit and flip argument to postive,
; keeping a count of sign bits in the Y register.
.local positive
lda arg + 1 ; 3 cyc
bpl positive ; 2 cyc
neg16 arg ; 18 cyc
iny ; 2 cyc
positive:
.endmacro
; 518 - 828 cyc
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
@ -438,11 +426,6 @@ positive:
result = FR2 ; 32-bit result
inter = temp2
ldy #0 ; 2 cyc
; counts the number of sign bits in Y
check_sign arg1 ; 5 to 25 cyc
check_sign arg2 ; 5 to 25 cyc
; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
@ -464,11 +447,16 @@ positive:
imul8 inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of mixed input signs, return a negative result.
cpy #1 ; 2 cyc
bne positive_result ; 2 cyc
neg32 result ; 34 cyc
positive_result:
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc
.endproc