WIP imul8 via table experiments

planning to try a 64KB table of 8x7-bit multiplies in extended (bank-switched) memory
on a 130XE or other machine with extended RAM

not yet working or finished

too many cycles of overhead per invocation
This commit is contained in:
Brooke Vibber 2024-12-25 10:51:27 -08:00
parent 05133aabdd
commit 405cec6d51
2 changed files with 133 additions and 32 deletions

94
imul8xe.s Normal file
View file

@ -0,0 +1,94 @@
; Fixed addresses used by the table-driven multiply.
FR0 = $d4 ; float48 -- OS floating-point register, borrowed as zero-page scratch
PORTB = $d301 ; PIA port B: XL/XE memory control (ROM enables, XE bank select)
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
; TODO: bankswitch has no value yet. It is commented out because an
; assignment with an empty right-hand side does not assemble.
; bankswitch = ; ???
;-----------------------------------------------------------------------
; imul8xe -- 8-bit multiply via a 64 KiB lookup table held in the four
;            16 KiB banks of 130XE extended RAM.
;
; In:    X = first factor, Y = second factor
; Out:   FR0 / FR0+1 = 16-bit result (lo/hi)
; Clobb: A, flags, FR0 .. FR0+3 (FR0+2/+3 hold the table pointer)
;        X and Y are returned unchanged.
;
; Table layout this routine reads (imul8xe_init has to build it to match):
;   bank    = Y bits 7-6            -> PORTB bits 3-2
;   address = EXTENDED_RAM + (Y & $3f) * 256 + (X & $fe)
;   entry   = 2 bytes, lo then hi, for X with bit 0 cleared
; Bit 0 of X is not part of the index; when it is set, Y is added to the
; table entry once more.
;
; This routine (and anything an enabled interrupt touches) must not live
; in $4000-$7fff: the CPU sees extended RAM there while the entry is read.
;
; About 111-127 cycles including the rts.
;-----------------------------------------------------------------------
.proc imul8xe
    ; ANTIC NMI enable register. It is write-only, so the previous value
    ; cannot be read back and saved.
    NMIEN      = $d40e
    ; NOTE(review): value written to turn NMIs back on. $40 = VBI only;
    ; use $c0 if the program also relies on display list interrupts.
    NMIEN_ON   = $40
    ; PORTB bits kept from the caller: 0 = OS ROM, 1 = BASIC, 6 = unused.
    PORTB_KEEP = %01000011
    ; PORTB bits forced while banked: 7 = self-test ROM off, 5 = ANTIC
    ; stays on main RAM. Bit 4 is left 0 so the CPU sees extended RAM.
    PORTB_BANK = %10100000

    output = FR0
    ptr    = FR0 + 2

    ; Save the second factor; Y is needed as the index for the table read.
    tya                     ; 2 cyc
    pha                     ; 3 cyc

    ; High pointer byte: low 6 bits of Y inside the $4000 window.
    ; ora is enough because $3f and $40 share no bits.
    and #$3f                ; 2 cyc
    ora #>EXTENDED_RAM      ; 2 cyc
    sta ptr + 1             ; 3 cyc

    ; Low pointer byte: X with the LSB dropped (entries are 2 bytes wide).
    txa                     ; 2 cyc
    and #$fe                ; 2 cyc
    sta ptr                 ; 3 cyc

    ; No NMIs while the $4000-$7fff window is remapped.
    ; NOTE(review): IRQs are left enabled, as in the original draft.
    lda #0                  ; 2 cyc
    sta NMIEN               ; 4 cyc

    ; Remember the caller's memory configuration and derive the banked one.
    lda PORTB               ; 4 cyc
    pha                     ; 3 cyc
    and #PORTB_KEEP         ; 2 cyc
    ora #PORTB_BANK         ; 2 cyc
    sta output              ; 3 cyc  (output is free until the table read)

    ; Top 2 bits of Y select the bank: move bits 7-6 down to bits 3-2.
    tya                     ; 2 cyc
    and #$c0                ; 2 cyc
    lsr a                   ; 2 cyc
    lsr a                   ; 2 cyc
    lsr a                   ; 2 cyc
    lsr a                   ; 2 cyc
    ora output              ; 3 cyc
    sta PORTB               ; 4 cyc

    ; Copy the table entry into output. ptr is even, so ptr+1 never
    ; crosses a page.
    ldy #0                  ; 2 cyc
    lda (ptr),y             ; 5 cyc
    sta output              ; 3 cyc
    iny                     ; 2 cyc
    lda (ptr),y             ; 5 cyc
    sta output + 1          ; 3 cyc

    ; Restore the caller's memory configuration, then NMIs.
    pla                     ; 4 cyc
    sta PORTB               ; 4 cyc
    lda #NMIEN_ON           ; 2 cyc
    sta NMIEN               ; 4 cyc

    ; Always pull the saved factor so the stack is balanced on both paths.
    pla                     ; 4 cyc
    tay                     ; 2 cyc

    ; Check the one bit of X that was skipped to fit the table into 64 KiB.
    txa                     ; 2 cyc
    and #1                  ; 2 cyc
    beq done                ; 2 cyc (3 when taken)

    ; Add the second factor one last time for the skipped bit.
    tya                     ; 2 cyc
    clc                     ; 2 cyc
    adc output              ; 3 cyc
    sta output              ; 3 cyc
    bcc done                ; 2 cyc (3 when taken)
    inc output + 1          ; 5 cyc
done:
    rts                     ; 6 cyc
.endproc
; imul8xe_init -- placeholder for the code that will fill the extended-RAM
; multiplication table read by imul8xe. Not implemented yet: it returns
; immediately and writes nothing.
.proc imul8xe_init
    rts
.endproc

View file

@ -372,51 +372,58 @@ fill_masks:
.local under256
.local next
.local small_product
; circa 92 cycles? this doesn't seem right
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; setup: 6 cycles
;ldx mul_factor_x
lda mul_factor_a ; 3 cyc
clc ; (a + x)^2/2: 23 cycles
adc mul_factor_x
tax
bcc under256
lda mul_hibyte512,x
bcs next
; (a + x)^2/2
clc ; 2 cyc
adc mul_factor_x ; 3 cyc
tax ; 2 cyc
bcc under256 ; 2 cyc
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
under256:
lda mul_hibyte256,x
sec
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
next:
sta mul_product_hi
lda mul_lobyte256,x
sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc
ldx mul_factor_a ; - a^2/2: 20 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
ldx mul_factor_x ; + x & a & 1: 22 cycles
txa ; (this is a kludge to correct a
and mul_factor_a ; roundoff error that makes odd * odd too low)
and #1
; + x & a & 1:
; (this is a kludge to correct a
; roundoff error that makes odd * odd too low)
ldx mul_factor_x ; 3 cyc
txa ; 2 cyc
and mul_factor_a ; 3 cyc
and #1 ; 2 cyc
clc
adc mul_product_lo
bcc small_product
inc mul_product_hi
clc ; 2 cyc
adc mul_product_lo ; 3 cyc
bcc small_product ; 2 cyc
inc mul_product_hi ; 5 cyc
; - x^2/2
small_product:
sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
.endscope
.endmacro