From 6db8cef82d4117ae2b3ede21e9ed3cf1ab720a22 Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Mon, 30 Dec 2024 15:17:50 -0800
Subject: [PATCH] 51-70 cycles for xe :D

---
 mandel.s | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/mandel.s b/mandel.s
index 9704a22..4ac8d4d 100644
--- a/mandel.s
+++ b/mandel.s
@@ -463,28 +463,27 @@ bank_switch_table:
 .macro imul8 dest, arg1, arg2, xe
     .if xe
         ; using 64KB lookup table
-        ; 53-72 cycles
+        ; 51-70 cycles
         ; clobbers x, y, dest, ptr
         .scope
             output = dest
 
-            ; bottom 14 bits except the LSB are the per-bank table index
-            ; add $4000 for the bank pointer
-            lda arg1     ; 3 cyc
-            and #$fe     ; 2 cyc
-            tay          ; 2 cyc
-            lda arg2     ; 3 cyc
-            and #$3f     ; 2 cyc
-            ora #$40     ; 2 cyc
-            sta ptr + 1  ; 3 cyc
-            
             ; top 2 bits are the table bank selector
             ldx arg2                ; 3 cyc
             lda bank_switch_table,x ; 4 cyc
             sta PORTB               ; 4 cyc
 
+            ; bottom 14 bits except the LSB are the per-bank table index:
+            ; ptr + 1 = low 6 bits of arg2 (still in x) plus $40, for the $4000 bank pointer
+            txa          ; 2 cyc
+            and #$3f     ; 2 cyc
+            ora #$40     ; 2 cyc
+            sta ptr + 1  ; 3 cyc
 
             ; copy the entry into output
+            lda arg1     ; 3 cyc
+            and #$fe     ; 2 cyc
+            tay          ; 2 cyc
             lda (ptr),y  ; 5 cyc
             sta output   ; 3 cyc
             iny          ; 2 cyc
@@ -503,9 +502,9 @@ bank_switch_table:
             and #1       ; 2 cyc
             beq done     ; 2 cyc
 
-            ; add the second param one last time for the skipped bit
+            ; add arg2 (expected to still be in x from the ldx above) one last time for the skipped bit
             clc          ; 2 cyc
-            lda arg2     ; 3 cyc
+            txa          ; 2 cyc
             adc output   ; 3 cyc
             sta output   ; 3 cyc
             lda #0       ; 2 cyc