From aee587388de88e35e8f3b345898bd4abc9acf3ed Mon Sep 17 00:00:00 2001
From: Jamey Sharp <jamey@minilop.net>
Date: Tue, 31 Dec 2024 02:01:45 -0800
Subject: [PATCH 1/4] eliminate mul_hibyte512 table

This costs about 4.5 extra cycles on average, assuming uniform
distribution of multiplication inputs: the carry-clear path is unchanged,
while the carry-set path takes 9 more cycles. I don't think that is worth
an extra 256-byte table.
---
 mandel.s  | 30 ++++++++++++++++--------------
 tables.js | 24 +++++++-----------------
 2 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/mandel.s b/mandel.s
index fc30532..ec1b086 100644
--- a/mandel.s
+++ b/mandel.s
@@ -129,9 +129,8 @@ KEY_0     = 50
     mantissa .byte 5
 .endstruct
 
-.import mul_lobyte256
-.import mul_hibyte256
-.import mul_hibyte512
+.import mul_lobyte
+.import mul_hibyte
 .import sqr_lobyte
 .import sqr_hibyte
 
@@ -548,22 +547,25 @@ bank_switch_table:
             clc                   ; 2 cyc         
             adc mul_factor_x      ; 3 cyc
             tax                   ; 2 cyc
-            bcc under256          ; 2 cyc
-            lda mul_hibyte512,x   ; 4 cyc
-            bcs next              ; 2 cyc
-        under256:
-            lda mul_hibyte256,x   ; 4 cyc
-            sec                   ; 2 cyc
+            lda mul_hibyte,x      ; 4 cyc
+            bcc next              ; 2 cyc
+            ; sum = 256 + x: hi byte needs + $80 + x; carry is set, so adc #$7f adds $80
+            adc #$7f              ; 2 cyc
+            clc                   ; 2 cyc
+            ; adc cannot add x directly, so stash it in mul_product_lo (overwritten below)
+            stx mul_product_lo    ; 3 cyc
+            adc mul_product_lo    ; 3 cyc
         next:
+            sec                   ; 2 cyc
             sta mul_product_hi    ; 3 cyc
-            lda mul_lobyte256,x   ; 4 cyc
+            lda mul_lobyte,x      ; 4 cyc
 
             ; - a^2/2
             ldx mul_factor_a      ; 3 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
+            sbc mul_lobyte,x      ; 4 cyc
             sta mul_product_lo    ; 3 cyc
             lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
+            sbc mul_hibyte,x      ; 4 cyc
             sta mul_product_hi    ; 3 cyc
 
             ; + x & a & 1:
@@ -582,10 +584,10 @@ bank_switch_table:
             ; - x^2/2
         small_product:
             sec                   ; 2 cyc
-            sbc mul_lobyte256,x   ; 4 cyc
+            sbc mul_lobyte,x      ; 4 cyc
             sta mul_product_lo    ; 3 cyc
             lda mul_product_hi    ; 3 cyc
-            sbc mul_hibyte256,x   ; 4 cyc
+            sbc mul_hibyte,x      ; 4 cyc
             sta mul_product_hi    ; 3 cyc
         .endscope
     .endif
diff --git a/tables.js b/tables.js
index 50cbef9..f4802ce 100644
--- a/tables.js
+++ b/tables.js
@@ -11,32 +11,22 @@ function db(func) {
     return lines.join('\n');
 }
 
-let squares = [];
-for (let i = 0; i < 512; i++) {
-    squares.push(Math.trunc((i * i + 1) / 2));
-}
-
 console.log(
 `.segment "TABLES"
 
-.export mul_lobyte256
-.export mul_hibyte256
-.export mul_hibyte512
+.export mul_lobyte
+.export mul_hibyte
 .export sqr_lobyte
 .export sqr_hibyte
 
-; (i * i + 1) / 2 for the multiplier
+; (i * i) / 2 for the multiplier
 .align 256
-mul_lobyte256:
-${db((i) => squares[i] & 0xff)}
+mul_lobyte:
+${db((i) => ((i * i) >> 1) & 0xff)}
 
 .align 256
-mul_hibyte256:
-${db((i) => (squares[i] >> 8) & 0xff)}
-
-.align 256
-mul_hibyte512:
-${db((i) => (squares[i + 256] >> 8) & 0xff)}
+mul_hibyte:
+${db((i) => ((i * i) >> 9) & 0xff)}
 
 ; (i * i) for the plain squares
 .align 256

From f06aed0c0080b45fdd92544afddcbebea6d74efa Mon Sep 17 00:00:00 2001
From: Jamey Sharp <jamey@minilop.net>
Date: Tue, 31 Dec 2024 02:22:31 -0800
Subject: [PATCH 2/4] set results from both 8-bit squares first

Since the results from the lo and hi squares don't overlap or overflow,
they can be written directly to the final output location without doing
any addition. Then only the multiplication that goes in the middle needs
any adds.
---
 mandel.s | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/mandel.s b/mandel.s
index ec1b086..a63d96f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -450,18 +450,6 @@ viewport_oy:
     sta dest + 1
 .endmacro
 
-; clobbers a, x
-.macro sqr8_add16 dest, arg
-    ldx arg
-    clc
-    lda sqr_lobyte,x
-    adc dest
-    sta dest
-    lda sqr_hibyte,x
-    adc dest + 1
-    sta dest + 1
-.endmacro
-
 .segment "TABLES"
 ; lookup table for top byte -> PORTB value for bank-switch
 .align 256
@@ -794,9 +782,7 @@ arg2_pos:
         ; h*h*256*256 + h*l*256 + h*l*256 + l*l
 
         sqr8 result, arg
-        lda #0
-        sta result + 2
-        sta result + 3
+        sqr8 result + 2, arg + 1
 
         imul8 inter, arg + 1, arg, xe
         add16 result + 1, result + 1, inter
@@ -804,8 +790,6 @@ arg2_pos:
         add16 result + 1, result + 1, inter
         add_carry result + 3
 
-        sqr8_add16 result + 2, arg + 1
-
         rts ; 6 cyc
     .endscope
 .endmacro

From 0f49760aa53b76f16fadf66b236b00df3d4fdd4c Mon Sep 17 00:00:00 2001
From: Jamey Sharp <jamey@minilop.net>
Date: Tue, 31 Dec 2024 02:26:24 -0800
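Subject: [PATCH 3/4] unify tables for squaring and multiplication

The multiplier tables already hold (i * i) / 2, so the plain square can
be rebuilt from them: bit 0 of i * i equals bit 0 of i, which sqr8 moves
into carry with lsr and then shifts back in with rol across both bytes.
That makes the separate sqr_lobyte and sqr_hibyte tables unnecessary.
---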
 mandel.s  | 10 ++++++----
 tables.js | 11 -----------
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/mandel.s b/mandel.s
index a63d96f..299db98 100644
--- a/mandel.s
+++ b/mandel.s
@@ -131,8 +131,6 @@ KEY_0     = 50
 
 .import mul_lobyte
 .import mul_hibyte
-.import sqr_lobyte
-.import sqr_hibyte
 
 .data
 
@@ -444,9 +442,13 @@ viewport_oy:
 ; clobbers a, x
 .macro sqr8 dest, arg
     ldx arg
-    lda sqr_lobyte,x
+    txa
+    lsr
+    lda mul_lobyte,x
+    rol
     sta dest
-    lda sqr_hibyte,x
+    lda mul_hibyte,x
+    rol
     sta dest + 1
 .endmacro
 
diff --git a/tables.js b/tables.js
index f4802ce..176e4df 100644
--- a/tables.js
+++ b/tables.js
@@ -16,8 +16,6 @@ console.log(
 
 .export mul_lobyte
 .export mul_hibyte
-.export sqr_lobyte
-.export sqr_hibyte
 
 ; (i * i) / 2 for the multiplier
 .align 256
@@ -28,13 +26,4 @@ ${db((i) => ((i * i) >> 1) & 0xff)}
 mul_hibyte:
 ${db((i) => ((i * i) >> 9) & 0xff)}
 
-; (i * i) for the plain squares
-.align 256
-sqr_lobyte:
-${db((i) => (i * i) & 0xff)}
-
-.align 256
-sqr_hibyte:
-${db((i) => ((i * i) >> 8) & 0xff)}
-
 `);

From 3553ce986f6721f8c6d446368cb6c6f55186713b Mon Sep 17 00:00:00 2001
From: Jamey Sharp <jamey@minilop.net>
Date: Tue, 31 Dec 2024 02:55:22 -0800
Subject: [PATCH 4/4] shave some cycles off 16-bit squaring with shift instead
 of add

Doubling the cross term in place with shl16 and adding it once replaces
adding it twice. Also fix the comments about how many cycles the shl
macros take.
---
 mandel.s | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mandel.s b/mandel.s
index 299db98..b0c2b42 100644
--- a/mandel.s
+++ b/mandel.s
@@ -348,7 +348,7 @@ viewport_oy:
     sub 4, dest, arg1, arg2
 .endmacro
 
-; 3 + 5 * bytes cycles
+; 3 + 5 * (bytes - 1) cycles
 .macro shl bytes, arg
     asl arg              ; 3 cyc
     .repeat bytes-1, i
@@ -356,17 +356,17 @@ viewport_oy:
     .endrepeat
 .endmacro
 
-; 13 cycles
+; 8 cycles
 .macro shl16 arg
     shl 2, arg
 .endmacro
 
-; 18 cycles
+; 13 cycles
 .macro shl24 arg
     shl 3, arg
 .endmacro
 
-; 23 cycles
+; 18 cycles
 .macro shl32 arg
     shl 4, arg
 .endmacro
@@ -787,7 +787,7 @@ arg2_pos:
         sqr8 result + 2, arg + 1
 
         imul8 inter, arg + 1, arg, xe
-        add16 result + 1, result + 1, inter
+        shl16 inter
         add_carry result + 3
         add16 result + 1, result + 1, inter
         add_carry result + 3