Compare commits

...

3 commits

Author SHA1 Message Date
829d2860e8 :P 2024-12-26 12:04:01 -08:00
f996c3cbcd provisional maybe
old mode runs in 81-92 cycles

provisional code runs in 58-77 cycles

if it works ;)
2024-12-25 12:47:37 -08:00
405cec6d51 WIP imul8 via table experiments
planning to try a 64KB table of 8x7-bit multiplies in the high memory
on a 130XE or other high-memory-capable machine

not yet working or finished

too many cycles of overhead per invocation
2024-12-25 10:51:27 -08:00
2 changed files with 115 additions and 32 deletions

75
imul8xe.s Normal file
View file

@ -0,0 +1,75 @@
; Fixed addresses used by the extended-RAM table multiply (imul8xe).
FR0 = $d4 ; float48 -- NOTE(review): presumably the OS floating-point register FR0; not referenced in the code visible here, confirm
PORTB = $d301 ; memory-control port; imul8xe stores a bank-select value here to map a table bank in
EXTENDED_RAM = $4000 ; 16KiB bank on the XE (base address of the bank-switched window)
; lookup table for top byte -> PORTB value for bank-switch
;
; Indexed by a full 8-bit value; only its top two bits (i & $c0) matter.
; They pick one of four 16KiB extended-RAM banks, so they must land in
; PORTB bits 2-3 (the 130XE bank-select field): shift right by 4.
; (A shift by 5 would put them in bits 1-2, toggling the BASIC ROM enable
; in bit 1 and reaching only two of the four banks.)
;
; $c1 supplies the remaining PORTB flags, unchanged from the original.
; NOTE(review): $c1 leaves bit 1 (BASIC ROM), bit 4 (CPU extended access)
; and bit 5 (ANTIC extended access) all at 0 -- confirm this is the
; intended configuration for the program's memory layout.
;
; Page-aligned so an absolute,X read of the table never crosses a page.
.align 256
bankswitch:
.repeat 256, i
    .byte ((i & $c0) >> 4) | $c1
.endrepeat
; imul8xe dest, arg1, arg2
;
; 8-bit by 8-bit multiply using a product table held in bank-switched
; extended RAM: each 16-bit entry is fetched from the $4000 window and
; stored to dest / dest+1.
;
; Table addressing:
;   bank            = top 2 bits of arg2 (via the bankswitch table)
;   pointer high    = (arg2 & $3f) + >EXTENDED_RAM
;   pointer low     = arg1 & $fe   (entries are 2 bytes, so bit 0 is dropped)
; Because bit 0 of arg1 is not part of the index, arg2 is added once more
; to the fetched entry when arg1 is odd.
;
; In:      arg1, arg2 - memory operands holding the two 8-bit factors
; Out:     dest, dest+1 - 16-bit result, low byte first
; Clobbers: A, X, Y, flags, dest to dest + 3 (dest+2/dest+3 hold the pointer)
; Requires: dest+2 must be usable with (zp),y addressing, i.e. dest in zero page
; Side effect: PORTB is left selecting the table bank (see note below).
; Timing:  58-77 cycles by the per-instruction counts below
;          (a taken branch costs one more than the 2 listed).
;
; NOTE(review): the odd-bit fix-up adds arg2 with a zero high byte; if the
; table holds signed products, confirm negative arg2 is handled by callers.
.macro imul8xe dest, arg1, arg2
    .local done
    .local output
    .local ptr
    output = dest
    ptr = dest + 2 ; scratch space assumed

    ; bottom 14 bits except the LSB are the per-bank table index
    ; add the bank window base ($4000) for the bank pointer
    lda arg1 ; 3 cyc
    and #$fe ; 2 cyc
    sta ptr ; 3 cyc
    lda arg2 ; 3 cyc
    and #$3f ; 2 cyc
    ; NOTE(review): bit 6 is clear after the AND, so `ora #>EXTENDED_RAM`
    ; would form the same byte without the clc (2 cycles less), but it
    ; would leave carry unchanged instead of cleared.
    clc ; 2 cyc
    adc #>EXTENDED_RAM ; 2 cyc
    sta ptr + 1 ; 3 cyc

    ; top 2 bits are the table bank selector
    ldx arg2 ; 3 cyc
    lda bankswitch,x ; 4 cyc
    sta PORTB ; 4 cyc

    ; copy the entry into output
    ldy #0 ; 2 cyc
    lda (ptr),y ; 5 cyc
    sta output ; 3 cyc
    iny ; 2 cyc
    lda (ptr),y ; 5 cyc
    sta output+1 ; 3 cyc

    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; if we need to use them anywhere else
    ;;; restore memory
    ;;lda #$81 ; 2 cyc - disabled
    ;;sta PORTB ; 4 cyc - disabled

    ; check that 1 bit we skipped to fit into space
    lda arg1 ; 3 cyc
    and #1 ; 2 cyc
    beq done ; 2 cyc (3 if taken)

    ; add the second param one last time for the skipped bit
    clc ; 2 cyc
    lda arg2 ; 3 cyc
    adc output ; 3 cyc
    sta output ; 3 cyc
    lda #0 ; 2 cyc
    adc output+1 ; 3 cyc - propagates the carry from the low byte
    sta output+1 ; 3 cyc
done:
.endmacro
; imul8xe_init
; Initialisation entry point for the extended-RAM multiply. Currently a
; stub that returns immediately; nothing in this file fills the table yet.
; In: none   Out: none   Clobbers: none
.proc imul8xe_init
    rts
.endproc

View file

@ -372,51 +372,59 @@ fill_masks:
.local under256
.local next
.local small_product
; circa 92 cycles? this doesn't seem right
; 81-92 cycles
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; setup: 6 cycles
;ldx mul_factor_x
lda mul_factor_a ; 3 cyc
clc ; (a + x)^2/2: 23 cycles
adc mul_factor_x
tax
bcc under256
lda mul_hibyte512,x
bcs next
; (a + x)^2/2
clc ; 2 cyc
adc mul_factor_x ; 3 cyc
tax ; 2 cyc
bcc under256 ; 2 cyc
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
under256:
lda mul_hibyte256,x
sec
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
next:
sta mul_product_hi
lda mul_lobyte256,x
sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc
ldx mul_factor_a ; - a^2/2: 20 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
ldx mul_factor_x ; + x & a & 1: 22 cycles
txa ; (this is a kludge to correct a
and mul_factor_a ; roundoff error that makes odd * odd too low)
and #1
; + x & a & 1:
; (this is a kludge to correct a
; roundoff error that makes odd * odd too low)
ldx mul_factor_x ; 3 cyc
txa ; 2 cyc
and mul_factor_a ; 3 cyc
and #1 ; 2 cyc
clc
adc mul_product_lo
bcc small_product
inc mul_product_hi
clc ; 2 cyc
adc mul_product_lo ; 3 cyc
bcc small_product ; 2 cyc
inc mul_product_hi ; 5 cyc
; - x^2/2
small_product:
sec ; - x^2/2: 25 cycles
sbc mul_lobyte256,x
sta mul_product_lo
lda mul_product_hi
sbc mul_hibyte256,x
sta mul_product_hi
sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
.endscope
.endmacro