apply jamey's suggestion of skipping add for high byte muls

rather than storing 0 into the high bytes and then adding in the
high-byte multiplication later, write it directly in place. this saves
a few cycles on every iteration, and it adds up nicely.

View 1 overview render times:
130XE: 10.050 ms/px - 4m56s
800XL: 10.906 ms/px - 5m21s
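
in outline, the change looks like this (a sketch distilled from the diff
below; it relies on imul8/sqr8 writing a full 16-bit product at their
destination, which is what the new code depends on):

; before: zero the high word, multiply into scratch, then add it in
lda #0
sta result + 2
sta result + 3
; ... low and cross partial products ...
imul8 inter, arg1 + 1, arg2 + 1, xe
add16 result + 2, result + 2, inter

; after: write the high-byte partial product straight into place
imul8 result + 2, arg1 + 1, arg2 + 1, xe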
Brooke Vibber 2025-01-04 10:53:51 -08:00
parent d157fe1306
commit 582ddf497f
2 changed files with 4 additions and 28 deletions

@@ -464,20 +464,6 @@ viewport_oy:
sta dest + 1
.endmacro
; input: arg as u8
; input/output: dest as u16
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
clc
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1
sta dest + 1
.endmacro
.segment "TABLES"
; lookup table for top byte -> PORTB value for bank-switch
.align 256
@@ -760,9 +746,8 @@ inner_loop:
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2, xe
lda #0
sta result + 2
sta result + 3
imul8 result + 2, arg1 + 1, arg2 + 1, xe
imul8 inter, arg1 + 1, arg2, xe
add16 result + 1, result + 1, inter
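
For reference, the comment above is the usual decomposition of a
16x16-bit product into four 8x8 partial products:

$(256 h_1 + l_1)(256 h_2 + l_2) = 65536\,h_1 h_2 + 256\,(h_1 l_2 + h_2 l_1) + l_1 l_2$

The h1*h2 term now lands directly in result+2..result+3, while the two
256-scaled cross terms are added in at result+1 with the carry rippling
up into result+3.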
@@ -772,9 +757,6 @@ inner_loop:
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1, xe
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
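
The adjustment cited above is the standard signed-multiply fixup: a
two's-complement 16-bit value read as unsigned is $a_u = a + 2^{16}\,[a < 0]$,
so the unsigned product overshoots the signed one by

$a_u b_u = ab + 2^{16}\,(a\,[b < 0] + b\,[a < 0]) \pmod{2^{32}}$

and the lines truncated here presumably subtract arg2 from the high word
when arg1 is negative (and vice versa) to cancel those extra terms.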
@@ -807,9 +789,8 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg
lda #0
sta result + 2
sta result + 3
sqr8 result + 2, arg + 1
imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter
@@ -817,8 +798,6 @@ arg2_pos:
add16 result + 1, result + 1, inter
add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc
.endscope
.endmacro
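
The squaring path is the same decomposition with both halves coming from
one argument, which is why the single cross product imul8 inter, arg + 1, arg
is added in twice:

$(256 h + l)^2 = 65536\,h^2 + 2 \cdot 256\,h l + l^2$

With sqr8 now writing h^2 directly into result+2..result+3, the
sqr8_add16 helper removed above has no remaining callers.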

@@ -1,8 +1,5 @@
things to try:
* skip add on the top-byte multiply in sqr8/mul8
  * should save a few cycles, suggestion by jamey
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D (see the sketch after this list)
* y-axis mirror optimization
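
for the imul8xe item, a hypothetical sketch of what the patch could look
like (imul8_body, imul8xe_body, and imul8xe_len are invented names, not
labels from this codebase): copy the expanded-ram routine over the base
one at init, so callers reach it without the extra jmp:

; hypothetical: overwrite imul8's body with imul8xe's at startup,
; skipping the 3-cycle jmp thunk on every subsequent call
ldx #0
patch_loop:
lda imul8xe_body,x   ; source: expanded-ram variant
sta imul8_body,x     ; destination: base routine, patched in place
inx
cpx #imul8xe_len     ; assumes the body fits in 256 bytes
bne patch_loop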