From 582ddf497f3c4f1aeae39201b2490dff14ff7f16 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Sat, 4 Jan 2025 10:53:51 -0800
Subject: [PATCH] apply jamey's suggestion of skipping add for high byte muls

rather than storing 0 into the high bytes of the result and then adding
the high-byte product later, write the high-byte product directly into
the high bytes of the result. this saves a few cycles on every
iteration, and it adds up nicely.

View 1 overview render times:
130XE: 10.050 ms/px - 4m56s
800XL: 10.906 ms/px - 5m21s
---
 mandel.s | 29 ++++-------------------------
 todo.md  |  3 ---
 2 files changed, 4 insertions(+), 28 deletions(-)

diff --git a/mandel.s b/mandel.s
index 210799a..526953a 100644
--- a/mandel.s
+++ b/mandel.s
@@ -464,20 +464,6 @@ viewport_oy:
     sta dest + 1
 .endmacro
 
-; input: arg as u8
-; input/output: dest as u16
-; clobbers a, x
-.macro sqr8_add16 dest, arg
-    ldx arg
-    clc
-    lda sqr_lobyte,x
-    adc dest
-    sta dest
-    lda sqr_hibyte,x
-    adc dest + 1
-    sta dest + 1
-.endmacro
-
 .segment "TABLES"
 ; lookup table for top byte -> PORTB value for bank-switch
 .align 256
@@ -760,9 +746,8 @@ inner_loop:
     ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
 
     imul8 result, arg1, arg2, xe
-    lda #0
-    sta result + 2
-    sta result + 3
+
+    imul8 result + 2, arg1 + 1, arg2 + 1, xe
 
     imul8 inter, arg1 + 1, arg2, xe
     add16 result + 1, result + 1, inter
@@ -772,9 +757,6 @@ inner_loop:
     add16 result + 1, result + 1, inter
     add_carry result + 3
 
-    imul8 inter, arg1 + 1, arg2 + 1, xe
-    add16 result + 2, result + 2, inter
-
     ; In case of negative inputs, adjust high word
     ; https://stackoverflow.com/a/28827013
     lda arg1 + 1
@@ -807,9 +789,8 @@ arg2_pos:
         ; h*h*256*256 + h*l*256 + h*l*256 + l*l
 
         sqr8 result, arg
-        lda #0
-        sta result + 2
-        sta result + 3
+
+        sqr8 result + 2, arg + 1
 
         imul8 inter, arg + 1, arg, xe
         add16 result + 1, result + 1, inter
@@ -817,8 +798,6 @@ arg2_pos:
         add16 result + 1, result + 1, inter
         add_carry result + 3
 
-        sqr8_add16 result + 2, arg + 1
-
         rts ; 6 cyc
     .endscope
 .endmacro
diff --git a/todo.md b/todo.md
index 1d46281..a78a2d5 100644
--- a/todo.md
+++ b/todo.md
@@ -1,8 +1,5 @@
 things to try:
 
-* skip add on the top-byte multiply in sqr8/mul8
-  * should save a few cycles, suggestion by jamey
-
 * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
 
 * y-axis mirror optimization