WIP imul8 via table experiments
Planning to try a 64KB table of 8x7-bit multiplies in the high memory on a 130XE or other high-memory-capable machine. Not yet working or finished; too many cycles of overhead per invocation.
This commit is contained in:
parent
05133aabdd
commit
405cec6d51
2 changed files with 133 additions and 32 deletions
94
imul8xe.s
Normal file
94
imul8xe.s
Normal file
|
@ -0,0 +1,94 @@
|
|||
; Zero-page and hardware addresses used by the table-driven multiply.
FR0          = $d4      ; float48: 6-byte register, used here as scratch/result space
PORTB        = $d301    ; written by imul8xe to switch memory banks

EXTENDED_RAM = $4000    ; 16KiB bank on the XE

; TODO: `bankswitch` had no value assigned yet ("= ; ???"), which does not
; assemble. Nothing references the symbol, so it stays commented out until
; its value is decided.
;bankswitch  = ???
|
||||
|
||||
; imul8xe -- 8x8 -> 16-bit multiply through a lookup table in extended RAM.
;
; input in X/Y (lo/hi): together they form the 16-bit table index
;   X = first factor  (its LSB is not part of the table index)
;   Y = second factor
; output in FR0 / FR0+1 (lo/hi)
; clobbers A, Y, flags and all six bytes of FR0; X is preserved
;
; Each table entry is two bytes (lo/hi) and is expected to hold
; (X & $fe) * Y; the dropped low bit of X is handled afterwards by adding
; Y once more when it was set.
; NOTE(review): nothing builds the table yet (imul8xe_init is a stub).
;
; Index bits 13..1 pick the entry inside the 16 KiB window at EXTENDED_RAM,
; index bits 15..14 pick which of the four banks is mapped into that window.
;
; NOTE(review): while the bank is mapped in, main RAM at $4000-$7fff is not
; visible to the CPU, so this routine must not be located there -- confirm
; against the linker config.
; NOTE(review): maskable IRQs are left enabled, as in the original draft.
;
; roughly 103 cycles (X even) to 120 cycles (X odd), not counting the JSR
; or cycles stolen by ANTIC
.proc imul8xe
    NMIEN         = $d40e       ; ANTIC NMI enable; write-only (reads return NMIST)
    ; NMIEN cannot be read back, so the previous mask cannot be saved.
    ; NOTE(review): $40 (VBI only) is assumed to be the mask in effect;
    ; use $c0 if the program enables display list interrupts.
    NMIEN_RESTORE = $40
    ; PORTB bits to keep when selecting a bank: everything except
    ; bits 2-3 (bank number) and bit 4 (CPU sees extended RAM when 0).
    PORTB_KEEP    = %11100011

    output    = FR0
    ptr       = FR0 + 2
    factor    = FR0 + 4         ; copy of Y for the final correction step
    bank_bits = FR0 + 5         ; bank number positioned for PORTB

    ; Y is needed as an index register below, so keep the factor in
    ; zero page instead of on the stack (no PHY on the NMOS 6502)
    sty factor                  ; 3 cyc

    ; bottom 14 bits except the LSB are the per-bank table index;
    ; OR in the window base to form the pointer
    txa                         ; 2 cyc
    and #$fe                    ; 2 cyc
    sta ptr                     ; 3 cyc
    tya                         ; 2 cyc
    and #$3f                    ; 2 cyc
    ora #>EXTENDED_RAM          ; 2 cyc  (bit 6 is clear here, so OR == add)
    sta ptr + 1                 ; 3 cyc

    ; top 2 bits are the table bank selector: move them from bits 7-6
    ; down to bits 3-2, where PORTB expects the bank number
    tya                         ; 2 cyc
    and #$c0                    ; 2 cyc
    lsr                         ; 2 cyc
    lsr                         ; 2 cyc
    lsr                         ; 2 cyc
    lsr                         ; 2 cyc
    sta bank_bits               ; 3 cyc

    ; disable NMIs so no handler runs while the window is remapped
    lda #0                      ; 2 cyc
    sta NMIEN                   ; 4 cyc

    ; map the selected bank in, remembering the previous PORTB value
    lda PORTB                   ; 4 cyc
    pha                         ; 3 cyc
    and #PORTB_KEEP             ; 2 cyc  (clears bit 4: CPU sees extended RAM)
    ora bank_bits               ; 3 cyc
    sta PORTB                   ; 4 cyc

    ; copy the entry into output; ptr is even, so ptr+1 never crosses a page
    ldy #0                      ; 2 cyc
    lda (ptr),y                 ; 5 cyc
    sta output                  ; 3 cyc
    iny                         ; 2 cyc
    lda (ptr),y                 ; 5 cyc
    sta output + 1              ; 3 cyc

    ; restore memory exactly as it was on entry
    pla                         ; 4 cyc
    sta PORTB                   ; 4 cyc

    ; restore interrupts
    lda #NMIEN_RESTORE          ; 2 cyc
    sta NMIEN                   ; 4 cyc

    ; check the 1 bit we skipped to fit into space
    txa                         ; 2 cyc
    and #1                      ; 2 cyc
    beq done                    ; 2 cyc (3 if taken)

    ; add the second param one last time for the skipped bit
    clc                         ; 2 cyc
    lda factor                  ; 3 cyc
    adc output                  ; 3 cyc
    sta output                  ; 3 cyc
    bcc done                    ; 2 cyc (3 if taken)
    inc output + 1              ; 5 cyc

done:
    rts                         ; 6 cyc
.endproc
|
||||
|
||||
; imul8xe_init -- placeholder; currently returns immediately without
; touching any register or memory.
; TODO: presumably this is where the extended-RAM multiplication table read
; by imul8xe gets generated -- not implemented yet.
.proc imul8xe_init
    rts
.endproc
|
71
mandel.s
71
mandel.s
|
@ -372,51 +372,58 @@ fill_masks:
|
|||
.local under256
|
||||
.local next
|
||||
.local small_product
|
||||
; circa 92 cycles? this doesn't seem right
|
||||
.scope
|
||||
mul_factor_a = arg1
|
||||
mul_factor_x = arg2
|
||||
mul_product_lo = dest
|
||||
mul_product_hi = dest + 1
|
||||
|
||||
lda mul_factor_a ; setup: 6 cycles
|
||||
;ldx mul_factor_x
|
||||
lda mul_factor_a ; 3 cyc
|
||||
|
||||
clc ; (a + x)^2/2: 23 cycles
|
||||
adc mul_factor_x
|
||||
tax
|
||||
bcc under256
|
||||
lda mul_hibyte512,x
|
||||
bcs next
|
||||
; (a + x)^2/2
|
||||
clc ; 2 cyc
|
||||
adc mul_factor_x ; 3 cyc
|
||||
tax ; 2 cyc
|
||||
bcc under256 ; 2 cyc
|
||||
lda mul_hibyte512,x ; 4 cyc
|
||||
bcs next ; 2 cyc
|
||||
under256:
|
||||
lda mul_hibyte256,x
|
||||
sec
|
||||
lda mul_hibyte256,x ; 4 cyc
|
||||
sec ; 2 cyc
|
||||
next:
|
||||
sta mul_product_hi
|
||||
lda mul_lobyte256,x
|
||||
sta mul_product_hi ; 3 cyc
|
||||
lda mul_lobyte256,x ; 4 cyc
|
||||
|
||||
ldx mul_factor_a ; - a^2/2: 20 cycles
|
||||
sbc mul_lobyte256,x
|
||||
sta mul_product_lo
|
||||
lda mul_product_hi
|
||||
sbc mul_hibyte256,x
|
||||
sta mul_product_hi
|
||||
; - a^2/2
|
||||
ldx mul_factor_a ; 3 cyc
|
||||
sbc mul_lobyte256,x ; 4 cyc
|
||||
sta mul_product_lo ; 3 cyc
|
||||
lda mul_product_hi ; 3 cyc
|
||||
sbc mul_hibyte256,x ; 4 cyc
|
||||
sta mul_product_hi ; 3 cyc
|
||||
|
||||
ldx mul_factor_x ; + x & a & 1: 22 cycles
|
||||
txa ; (this is a kludge to correct a
|
||||
and mul_factor_a ; roundoff error that makes odd * odd too low)
|
||||
and #1
|
||||
; + x & a & 1:
|
||||
; (this is a kludge to correct a
|
||||
; roundoff error that makes odd * odd too low)
|
||||
ldx mul_factor_x ; 3 cyc
|
||||
txa ; 2 cyc
|
||||
and mul_factor_a ; 3 cyc
|
||||
and #1 ; 2 cyc
|
||||
|
||||
clc
|
||||
adc mul_product_lo
|
||||
bcc small_product
|
||||
inc mul_product_hi
|
||||
clc ; 2 cyc
|
||||
adc mul_product_lo ; 3 cyc
|
||||
bcc small_product ; 2 cyc
|
||||
inc mul_product_hi ; 5 cyc
|
||||
|
||||
; - x^2/2
|
||||
small_product:
|
||||
sec ; - x^2/2: 25 cycles
|
||||
sbc mul_lobyte256,x
|
||||
sta mul_product_lo
|
||||
lda mul_product_hi
|
||||
sbc mul_hibyte256,x
|
||||
sta mul_product_hi
|
||||
sec ; 2 cyc
|
||||
sbc mul_lobyte256,x ; 4 cyc
|
||||
sta mul_product_lo ; 3 cyc
|
||||
lda mul_product_hi ; 3 cyc
|
||||
sbc mul_hibyte256,x ; 4 cyc
|
||||
sta mul_product_hi ; 3 cyc
|
||||
.endscope
|
||||
.endmacro
|
||||
|
||||
|
|
Loading…
Reference in a new issue