From f996c3cbcd84b3aff3fd39bf3daee9a6c60a9e2a Mon Sep 17 00:00:00 2001
From: Brooke Vibber <bvibber@pobox.com>
Date: Wed, 25 Dec 2024 12:47:37 -0800
Subject: [PATCH] provisional maybe

old mode runs in 81-92 cycles

provisional code runs in 58-77 cycles

if it works ;)
---
 imul8xe.s | 76 ++++++++++++++++++++-----------------------------------
 mandel.s  |  1 +
 2 files changed, 29 insertions(+), 48 deletions(-)

diff --git a/imul8xe.s b/imul8xe.s
index 5cbb852..d12f53f 100644
--- a/imul8xe.s
+++ b/imul8xe.s
@@ -3,55 +3,38 @@ PORTB = $d301
 
 
 EXTENDED_RAM = $4000 ; 16KiB bank on the XE
-bankswitch = ; ???
 
-; input in X/Y (lo/hi)
-; output in FR0
-; clobbers FR0
-; 128 cycles
-proc imul8xe
-    output = FR0
-    ptr = FR0 + 2
+; lookup table: top byte of the table index -> PORTB value that banks in its 16KiB table
+.align 256 ; page-aligned so `lda bankswitch,x` never pays a page-crossing cycle
+bankswitch:
+    .repeat 256, i ; NOTE(review): $c1 leaves PORTB bits 1, 4 and 5 clear - confirm intended
+        .byte ((i & $c0) >> 4) | $c1 ; bank number (top 2 bits) -> PORTB bits 2-3
+    .endrepeat
 
-    lda #0       ; 2 cyc
-    sta ptr      ; 3 cyc
-    sta ptr + 1  ; 3 cyc
+; 58-77 cycles. Writes dest..dest+3, clobbers A and X, uses Y as index; PORTB is left banked.
+.macro imul8xe dest, arg1, arg2
+.local done
+.local output
+.local ptr
+
+    output = dest
+    ptr = dest + 2 ; 2 bytes of scratch directly after the 2-byte result
 
     ; bottom 14 bits except the LSB are the per-bank table index
     ; add $4000 for the bank pointer
-    txa          ; 2 cyc
+    lda arg1     ; 3 cyc
     and #$fe     ; 2 cyc
     sta ptr      ; 3 cyc
-    tya          ; 2 cyc
+    lda arg2     ; 3 cyc
     and #$3f     ; 2 cyc
     clc          ; 2 cyc
     adc #$40     ; 2 cyc
     sta ptr + 1  ; 3 cyc
     
     ; top 2 bits are the table bank selector
-    tya          ; 2 cyc
-    and #$c0     ; 2 cyc
-    ; shift in extended RAM mode 2x 1 bits
-    sec          ; 2 cyc
-    ror          ; 2 cyc
-    ror          ; 2 cyc
-    ; shift in 0 bits
-    asr          ; 2 cyc
-    asr          ; 2 cyc
-    asr          ; 2 cyc
-
-    ; save the second param for later
-    phy          ; 3 cyc
-
-    ; disable interrupts
-    lda NMIEN    ; 4 cyc
-    pha          ; 3 cyc
-    lda #0       ; 2 cyc
-    sta NMIEN    ; 4 cyc
-
-    ; set the standard top RAM and OS ROM on
-    or #$81      ; 2 cyc
-    sta PORTB    ; 4 cyc
+    ldx arg2          ; 3 cyc
+    lda bankswitch,x  ; 4 cyc
+    sta PORTB         ; 4 cyc
 
 
     ; copy the entry into output
@@ -62,22 +45,21 @@ proc imul8xe
     lda (ptr),y  ; 5 cyc
     sta output+1 ; 3 cyc
 
-    ; restore memory
-    lda #$81     ; 2 cyc
-    sta PORTB    ; 4 cyc
-
-    ; restore interrupts
-    pla          ; 3 cyc
-    sta NMIEN    ; 4 cyc
+    ; note: we are not restoring memory to save 6 cycles!
+    ; this means those 16kb have to be switched back to base RAM
+    ; if we need to use them anywhere else
+    ;;; restore memory
+    ;;lda #$81     ; 2 cyc - disabled
+    ;;sta PORTB    ; 4 cyc - disabled
 
     ; check that 1 bit we skipped to fit into space
-    txa          ; 2 cyc
+    lda arg1     ; 3 cyc
     and $#1      ; 2 cyc
     beq done     ; 2 cyc
 
     ; add the second param one last tie for the skipped bit
     clc          ; 2 cyc
-    pla          ; 3 cyc
+    lda arg2     ; 3 cyc
     adc output   ; 3 cyc
     sta output   ; 3 cyc
     lda #0       ; 2 cyc
@@ -85,9 +67,7 @@ proc imul8xe
     sta output+1 ; 3 cyc
 
 done:
-    pla
-    rts          ; 6 cyc
-endproc
+.endmacro
 
 proc imul8xe_init
     rts
diff --git a/mandel.s b/mandel.s
index 3b0bc9f..e0a8570 100644
--- a/mandel.s
+++ b/mandel.s
@@ -373,6 +373,7 @@ fill_masks:
     .local next
     .local small_product
     ; circa 92 cycles? this doesn't seem right
+    ; 81-92 cycles
     .scope
         mul_factor_a   = arg1
         mul_factor_x   = arg2