slightly faster handling of signed mul
Previously we were flipping the inputs if negative, and then the output if both inputs were negative. Turns out you can just treat the whole thing as an unsigned mul and then subtract each term from the high word if the other term is negative: https://stackoverflow.com/a/28827013. This saves a handful of cycles, reducing our runtime to a mere 14.211 ms/px \o/
This commit is contained in:
parent
7f2bc43cff
commit
05133aabdd
1 changed files with 10 additions and 22 deletions
32
mandel.s
32
mandel.s
|
@ -344,18 +344,6 @@ fill_masks:
|
||||||
neg 4, arg
|
neg 4, arg
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; 5 to 25 cycles
|
|
||||||
.macro check_sign arg
|
|
||||||
; Check sign bit and flip argument to positive,
|
|
||||||
; keeping a count of sign bits in the Y register.
|
|
||||||
.local positive
|
|
||||||
lda arg + 1 ; 3 cyc
|
|
||||||
bpl positive ; 2 cyc
|
|
||||||
neg16 arg ; 18 cyc
|
|
||||||
iny ; 2 cyc
|
|
||||||
positive:
|
|
||||||
.endmacro
|
|
||||||
|
|
||||||
; 518 - 828 cyc
|
; 518 - 828 cyc
|
||||||
.macro imul16 dest, arg1, arg2
|
.macro imul16 dest, arg1, arg2
|
||||||
copy16 FR0, arg1 ; 12 cyc
|
copy16 FR0, arg1 ; 12 cyc
|
||||||
|
@ -438,11 +426,6 @@ positive:
|
||||||
result = FR2 ; 32-bit result
|
result = FR2 ; 32-bit result
|
||||||
inter = temp2
|
inter = temp2
|
||||||
|
|
||||||
ldy #0 ; 2 cyc
|
|
||||||
; counts the number of sign bits in Y
|
|
||||||
check_sign arg1 ; 5 to 25 cyc
|
|
||||||
check_sign arg2 ; 5 to 25 cyc
|
|
||||||
|
|
||||||
; h1l1 * h2l2
|
; h1l1 * h2l2
|
||||||
; (h1*256 + l1) * (h2*256 + l2)
|
; (h1*256 + l1) * (h2*256 + l2)
|
||||||
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
||||||
|
@ -464,11 +447,16 @@ positive:
|
||||||
imul8 inter, arg1 + 1, arg2 + 1
|
imul8 inter, arg1 + 1, arg2 + 1
|
||||||
add16 result + 2, result + 2, inter
|
add16 result + 2, result + 2, inter
|
||||||
|
|
||||||
; In case of mixed input signs, return a negative result.
|
; In case of negative inputs, adjust high word
|
||||||
cpy #1 ; 2 cyc
|
; https://stackoverflow.com/a/28827013
|
||||||
bne positive_result ; 2 cyc
|
lda arg1 + 1
|
||||||
neg32 result ; 34 cyc
|
bpl arg1_pos
|
||||||
positive_result:
|
sub16 result + 2, result + 2, arg2
|
||||||
|
arg1_pos:
|
||||||
|
lda arg2 + 1
|
||||||
|
bpl arg2_pos
|
||||||
|
sub16 result + 2, result + 2, arg1
|
||||||
|
arg2_pos:
|
||||||
|
|
||||||
rts ; 6 cyc
|
rts ; 6 cyc
|
||||||
.endproc
|
.endproc
|
||||||
|
|
Loading…
Reference in a new issue