Apply Jamey's suggestion of skipping the add for high-byte multiplies

Rather than storing 0 into the high bytes and then adding the high-byte multiplication result later, write that result directly in place. This saves a few cycles on every iteration, and it adds up nicely. View 1 overview render times: 130XE: 10.050 ms/px (4m56s); 800XL: 10.906 ms/px (5m21s).
2025-01-04 10:53:51 -08:00 · 2025-01-04 10:53:51 -08:00 · 582ddf497f
commit 582ddf497f
parent d157fe1306
2 changed files with 4 additions and 28 deletions
--- a/mandel.s
+++ b/mandel.s
@ -464,20 +464,6 @@ viewport_oy:
    sta dest + 1
 .endmacro
 ; sqr8_add16: look up a 16-bit value for an 8-bit argument in the
 ; sqr_lobyte / sqr_hibyte tables and add it to a 16-bit accumulator:
 ;   dest += (sqr_hibyte[arg] << 8) | sqr_lobyte[arg]
 ; dest is stored low byte first (dest = low, dest + 1 = high).
 ; NOTE(review): presumably the tables hold the low/high bytes of arg*arg;
 ; the table definitions are outside this view -- confirm.
 ; input: arg as u8
 ; input/output: dest as u16
 ; clobbers a, x
 ; (flags are modified as well; C is left holding the carry out of the
 ; high-byte add)
 .macro sqr8_add16 dest, arg
    ldx arg                 ; x = table index
    clc                     ; no carry into the low-byte add
    lda sqr_lobyte,x
    adc dest
    sta dest                ; low byte done; C = carry out of the low byte
    lda sqr_hibyte,x        ; lda does not touch C, so the carry survives
    adc dest + 1            ; high byte + carry from the low byte
    sta dest + 1
 .endmacro
 .segment "TABLES"
 ; lookup table for top byte -> PORTB value for bank-switch
 .align 256
@ -760,9 +746,8 @@ inner_loop:
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
    imul8 result, arg1, arg2, xe
-    lda #0
+
-    sta result + 2
+    imul8 result + 2, arg1 + 1, arg2 + 1, xe
    sta result + 3
    imul8 inter, arg1 + 1, arg2, xe
    add16 result + 1, result + 1, inter
@ -772,9 +757,6 @@ inner_loop:
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8 inter, arg1 + 1, arg2 + 1, xe
    add16 result + 2, result + 2, inter
    ; In case of negative inputs, adjust high word
    ; https://stackoverflow.com/a/28827013
    lda arg1 + 1
@ -807,9 +789,8 @@ arg2_pos:
        ; h*h*256*256 + h*l*256 + h*l*256 + l*l
        sqr8 result, arg
-        lda #0
+
-        sta result + 2
+        sqr8 result + 2, arg + 1
        sta result + 3
        imul8 inter, arg + 1, arg, xe
        add16 result + 1, result + 1, inter
@ -817,8 +798,6 @@ arg2_pos:
        add16 result + 1, result + 1, inter
        add_carry result + 3
        sqr8_add16 result + 2, arg + 1
        rts ; 6 cyc
    .endscope
 .endmacro
--- a/todo.md
+++ b/todo.md
@ -1,8 +1,5 @@
 things to try:
 * skip add on the top-byte multiply in sqr8/mul8
  * should save a few cycles, suggestion by jamey
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 * y-axis mirror optimization