slightly faster handling of signed mul

Previously we were flipping the inputs if negative, and then flipping the output if the input signs were mixed (exactly one input negative). It turns out you can just treat the whole thing as an unsigned mul and then subtract each term from the high word if the other term is negative: https://stackoverflow.com/a/28827013 This saves a handful of cycles, reducing our runtime to a mere 14.211 ms/px \o/
2024-12-15 20:17:45 -08:00 · 2024-12-15 20:17:45 -08:00 · 05133aabdd
commit 05133aabdd
parent 7f2bc43cff
1 changed files with 10 additions and 22 deletions
--- a/mandel.s
+++ b/mandel.s
@@ -344,18 +344,6 @@ fill_masks:
    neg 4, arg
 .endmacro

-; 5 to 25 cycles
-.macro check_sign arg
-    ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the Y register.
-    .local positive
-    lda arg + 1   ; 3 cyc
-    bpl positive  ; 2 cyc
-    neg16 arg     ; 18 cyc
-    iny           ; 2 cyc
-positive:
-.endmacro
-
 ; 518 - 828 cyc
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
@@ -438,11 +426,6 @@ positive:
    result = FR2 ; 32-bit result
    inter = temp2

-    ldy #0          ; 2 cyc
-    ; counts the number of sign bits in Y
-    check_sign arg1 ; 5 to 25 cyc
-    check_sign arg2 ; 5 to 25 cyc
-
    ; h1l1 * h2l2
    ; (h1*256 + l1) * (h2*256 + l2)
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
@@ -464,11 +447,16 @@ positive:
    imul8 inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter

-    ; In case of mixed input signs, return a negative result.
-    cpy #1              ; 2 cyc
-    bne positive_result ; 2 cyc
-    neg32 result        ; 34 cyc
-positive_result:
+    ; In case of negative inputs, adjust high word
+    ; https://stackoverflow.com/a/28827013
+    lda arg1 + 1
+    bpl arg1_pos
+    sub16 result + 2, result + 2, arg2
+arg1_pos:
+    lda arg2 + 1
+    bpl arg2_pos
+    sub16 result + 2, result + 2, arg1
+arg2_pos:

    rts ; 6 cyc
 .endproc