; Zero-page floating-point register 0 (6 bytes wide); the table-init code
; in this file borrows its first two bytes as a 16-bit accumulator.
FR0    = $d4 ; float48
; I/O port written by the bank-switch code to choose the extended-RAM bank.
PORTB = $d301


; Base address of the 16 KiB window through which the selected bank is seen.
EXTENDED_RAM = $4000 ; 16KiB bank on the XE

; PORTB layout on the 130XE, as relied on by the values generated below:
;   bit 0     OS ROM enable         (1 = ROM mapped)
;   bit 1     BASIC ROM disable     (1 = BASIC off)
;   bits 2-3  extended RAM bank number for the $4000-$7fff window
;   bit 4     CPU access to the extended bank   (0 = enabled)
;   bit 5     ANTIC access to the extended bank (0 = enabled)
;   bit 7     self-test ROM disable (1 = off)
; NOTE(review): the base value keeps BASIC off and ANTIC on main RAM;
; confirm this matches how the program is loaded and where the display lives.
PORTB_BANK_BASE  = $e3
PORTB_BANK_SHIFT = 2

; lookup table for top byte -> PORTB value for bank-switch
; Only the top two bits of the index matter; they pick one of four banks.
; Page-aligned so the absolute,x read in imul8xe never crosses a page.
.align 256
bankswitch:
    .repeat 256, i
        .byte (((i & $c0) >> 6) << PORTB_BANK_SHIFT) | PORTB_BANK_BASE
    .endrepeat

; imul8xe dest, arg1, arg2
;
; Unsigned 8x8 -> 16 bit multiply through the table built by imul8xe_init:
;   dest, dest+1 = arg1 * arg2   (little-endian)
;
; Table layout: bank = top two bits of arg2; within the bank the entry for
; (arg1 & $fe) * arg2 lives at EXTENDED_RAM + ((arg2 & $3f) << 8) + (arg1 & $fe).
; The dropped low bit of arg1 is handled by adding arg2 once more.
;
; dest must be in zero page (dest+2/dest+3 are used as an indirect pointer).
; arg1 and arg2 are read again after dest..dest+3 are written, so they must
; not overlap that range.
;
; about 57-75 cycles with all operands in zero page
; clobbers a, x, y, dest to dest + 3
; leaves the selected extended-RAM bank mapped at EXTENDED_RAM (see below)
.macro imul8xe dest, arg1, arg2
.local done
.local output
.local ptr

    output = dest
    ptr = dest + 2 ; scratch space assumed

    ; bottom 14 bits except the LSB are the per-bank table index
    ; or in the window base page for the bank pointer
    lda arg1           ; 3 cyc
    and #$fe           ; 2 cyc
    sta ptr            ; 3 cyc
    lda arg2           ; 3 cyc
    and #$3f           ; 2 cyc
    ora #>EXTENDED_RAM ; 2 cyc - same as adding, the masked value is < $40
    sta ptr + 1        ; 3 cyc

    ; top 2 bits are the table bank selector
    ldx arg2          ; 3 cyc
    lda bankswitch,x  ; 4 cyc
    sta PORTB         ; 4 cyc


    ; copy the entry into output
    ldy #0       ; 2 cyc
    lda (ptr),y  ; 5 cyc
    sta output   ; 3 cyc
    iny          ; 2 cyc
    lda (ptr),y  ; 5 cyc
    sta output+1 ; 3 cyc

    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; (PORTB bit 4 set again) if we need to use them anywhere else
    ;;; restore memory
    ;;lda #(PORTB_BANK_BASE | $10) ; 2 cyc - disabled
    ;;sta PORTB                    ; 4 cyc - disabled

    ; check that 1 bit we skipped to fit into space
    lda arg1     ; 3 cyc
    and #1       ; 2 cyc
    beq done     ; 2 cyc (+1 when taken)

    ; add the second param one last time for the skipped bit
    clc          ; 2 cyc
    lda arg2     ; 3 cyc
    adc output   ; 3 cyc
    sta output   ; 3 cyc
    lda #0       ; 2 cyc
    adc output+1 ; 3 cyc
    sta output+1 ; 3 cyc

done:
.endmacro

; bank_switch bank
; Map extended-RAM bank `bank` (0-3, assemble-time constant) into the
; window at EXTENDED_RAM, using the same PORTB encoding as the bankswitch table.
; clobbers a
.macro bank_switch bank
    lda #((bank << PORTB_BANK_SHIFT) | PORTB_BANK_BASE)
    sta PORTB
.endmacro

proc imul8xe_init

    ; Build the whole multiplication table: one 16KB section per bank.
    ; The multipliers live in zero page and are carried from one section
    ; to the next by imul8xe_init_section, so they are only zeroed once.

    arg1 = FR1
    arg2 = FR2
    result = FR0

    lda #$00
    sta arg1
    sta arg2

    ; section 0: $00 * $00 -> $3f * $ff
    ; section 1: $40 * $00 -> $7f * $ff
    ; section 2: $80 * $00 -> $bf * $ff
    ; section 3: $c0 * $00 -> $ff * $ff
    .repeat 4, section
        bank_switch section
        jsr imul8xe_init_section
    .endrepeat

    rts
endproc

; Initialize a 16 KB chunk of the table
;
; The bank to fill must already be mapped at EXTENDED_RAM.
; Each of the 64 pages holds the products for one value of arg2; within a
; page, the 16-bit little-endian entry at even offset n is n * arg2.
;
; input: arg2 (FR2) = multiplier for the first page of this chunk
; output: arg2 advanced by $40, ready for the next chunk
;         arg1 (FR1) is stepped by 2 per entry and wraps back to its
;         entry value at the end of every page; it is not read here
; clobbers: a, y, result (FR0, FR0+1), ptr (temp2, temp2+1)
proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    result = FR0
    ptr = temp2

    ; ptr = start of the bank window
    lda #<EXTENDED_RAM
    sta ptr
    lda #>EXTENDED_RAM
    sta ptr + 1

    ; outer loop: one page per arg2 value, 64 pages per chunk
outer_loop:

    ; reset result to 0 (entry for multiplier 0)
    lda #0
    sta result
    sta result + 1

    ; inner loop: even multipliers $00 -> $fe
inner_loop:

    ; copy result to data set, low byte first
    ldy #0
    lda result
    sta (ptr),y
    iny
    lda result + 1
    sta (ptr),y

    ; result += 2 * arg2, done as two 16-bit additions of arg2
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1

    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1

    ; inner loop check: low byte of ptr wraps to 0 after 128 entries
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop

    ; outer loop check: stop once ptr leaves the 16 KB window
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #>(EXTENDED_RAM + $4000)
    bne outer_loop

    rts

endproc