From 405cec6d511947ccc1a0dcc3c79e06e4ac1a5278 Mon Sep 17 00:00:00 2001
From: Brooke Vibber
Date: Wed, 25 Dec 2024 10:51:27 -0800
Subject: [PATCH] WIP imul8 via table experiments

planning to try a 64KB table of 8x7-bit multiplies in the high
memory on a 130XE or other high-memory-capable machine

not yet working or finished

too many cycles of overhead per invocation
---
Reviewer note: this copy was re-flowed from a whitespace-collapsed paste, so
indentation is reconstructed and the mandel.s hunk may need whitespace
tolerance to apply. imul8xe.s was also revised during review; the blob id on
its "index" line predates that revision.

 imul8xe.s | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 mandel.s  |  71 ++++++++++++++++++++++-------------------
 2 files changed, 149 insertions(+), 32 deletions(-)
 create mode 100644 imul8xe.s

diff --git a/imul8xe.s b/imul8xe.s
new file mode 100644
index 0000000..5cbb852
--- /dev/null
+++ b/imul8xe.s
@@ -0,0 +1,110 @@
+FR0 = $d4 ; float48
+PORTB = $d301 ; PIA port B: XL/XE memory control
+NMIEN = $d40e ; ANTIC NMI enable mask, write-only
+
+
+EXTENDED_RAM = $4000 ; 16KiB bank on the XE
+; bankswitch = ??? (unfinished placeholder; nothing references it)
+
+; PORTB value that maps an extended bank at EXTENDED_RAM for the CPU:
+;   bit 7 = 1   self-test ROM off
+;   bit 5 = 1   ANTIC keeps reading main RAM
+;   bit 4 = 0   CPU reads the extended bank (active low)
+;   bits 3-2    bank number, or'ed in at run time
+;   bit 1 = 1   BASIC ROM off
+;   bit 0 = 1   OS ROM on
+PORTB_XE_CPU = $e3
+
+; written to NMIEN once the table read is done; NMIEN cannot be read
+; back, so the previous mask cannot be saved and restored
+; NOTE(review): $c0 (VBI + DLI) is an assumption, confirm it matches
+; what the host program enables
+NMIEN_ENABLED = $c0
+
+; 8-bit x 8-bit multiply via a 64KiB table in extended RAM
+;
+; the entry for (X, Y) is two bytes, lo/hi, in bank Y >> 6 at
+; EXTENDED_RAM + (Y & $3f) * 256 + (X & $fe); it must hold
+; Y * (X & $fe), and Y is added once more here when X is odd
+;
+; input in X/Y (lo/hi)
+; output in FR0
+; clobbers A, Y, FR0 .. FR0 + 3; X is preserved
+; 101-115 cycles, not counting the caller's jsr
+.proc imul8xe
+    output = FR0
+    ptr = FR0 + 2
+
+    ; bottom 14 bits except the LSB are the per-bank table index
+    ; or in $40 (the high byte of $4000) for the bank pointer
+    txa                 ; 2 cyc
+    and #$fe            ; 2 cyc
+    sta ptr             ; 3 cyc
+    tya                 ; 2 cyc
+    and #$3f            ; 2 cyc
+    ora #>EXTENDED_RAM  ; 2 cyc
+    sta ptr + 1         ; 3 cyc
+
+    ; save the second param for later (no phy on the NMOS 6502)
+    tya                 ; 2 cyc
+    pha                 ; 3 cyc
+
+    ; save the caller's memory configuration
+    lda PORTB           ; 4 cyc
+    pha                 ; 3 cyc
+
+    ; top 2 bits are the table bank selector, move them to bits 3-2
+    tya                 ; 2 cyc
+    and #$c0            ; 2 cyc
+    lsr                 ; 2 cyc
+    lsr                 ; 2 cyc
+    lsr                 ; 2 cyc
+    lsr                 ; 2 cyc
+    ora #PORTB_XE_CPU   ; 2 cyc
+
+    ; disable NMIs; Y = 0 doubles as the index for the table read,
+    ; leaving the new PORTB value intact in A
+    ldy #0              ; 2 cyc
+    sty NMIEN           ; 4 cyc
+
+    ; switch the table bank in
+    sta PORTB           ; 4 cyc
+
+    ; copy the entry into output
+    lda (ptr),y         ; 5 cyc
+    sta output          ; 3 cyc
+    iny                 ; 2 cyc
+    lda (ptr),y         ; 5 cyc
+    sta output + 1      ; 3 cyc
+
+    ; restore memory
+    pla                 ; 4 cyc
+    sta PORTB           ; 4 cyc
+
+    ; restore interrupts
+    lda #NMIEN_ENABLED  ; 2 cyc
+    sta NMIEN           ; 4 cyc
+
+    ; check that 1 bit we skipped to fit into space: lsr moves it
+    ; into carry, and pla leaves carry alone; the pla runs on both
+    ; paths so the stack is balanced before rts
+    txa                 ; 2 cyc
+    lsr                 ; 2 cyc
+    pla                 ; 4 cyc
+    bcc done            ; 2-3 cyc
+
+    ; add the second param one last time for the skipped bit
+    clc                 ; 2 cyc
+    adc output          ; 3 cyc
+    sta output          ; 3 cyc
+    bcc done            ; 2-3 cyc
+    inc output + 1      ; 5 cyc
+
+done:
+    rts                 ; 6 cyc
+.endproc
+
+; TODO: fill the four extended banks with the product table
+.proc imul8xe_init
+    rts
+.endproc
diff --git a/mandel.s b/mandel.s
index 3622995..3b0bc9f 100644
--- a/mandel.s
+++ b/mandel.s
@@ -372,51 +372,58 @@ fill_masks:
     .local under256
     .local next
     .local small_product
+    ; circa 92 cycles? this doesn't seem right
     .scope
     mul_factor_a = arg1
     mul_factor_x = arg2
     mul_product_lo = dest
     mul_product_hi = dest + 1
 
-    lda mul_factor_a ; setup: 6 cycles
-    ;ldx mul_factor_x
+    lda mul_factor_a ; 3 cyc
 
-    clc ; (a + x)^2/2: 23 cycles
-    adc mul_factor_x
-    tax
-    bcc under256
-    lda mul_hibyte512,x
-    bcs next
+    ; (a + x)^2/2
+    clc ; 2 cyc
+    adc mul_factor_x ; 3 cyc
+    tax ; 2 cyc
+    bcc under256 ; 2 cyc
+    lda mul_hibyte512,x ; 4 cyc
+    bcs next ; 2 cyc
 under256:
-    lda mul_hibyte256,x
-    sec
+    lda mul_hibyte256,x ; 4 cyc
+    sec ; 2 cyc
 next:
-    sta mul_product_hi
-    lda mul_lobyte256,x
+    sta mul_product_hi ; 3 cyc
+    lda mul_lobyte256,x ; 4 cyc
 
-    ldx mul_factor_a ; - a^2/2: 20 cycles
-    sbc mul_lobyte256,x
-    sta mul_product_lo
-    lda mul_product_hi
-    sbc mul_hibyte256,x
-    sta mul_product_hi
+    ; - a^2/2
+    ldx mul_factor_a ; 3 cyc
+    sbc mul_lobyte256,x ; 4 cyc
+    sta mul_product_lo ; 3 cyc
+    lda mul_product_hi ; 3 cyc
+    sbc mul_hibyte256,x ; 4 cyc
+    sta mul_product_hi ; 3 cyc
 
-    ldx mul_factor_x ; + x & a & 1: 22 cycles
-    txa ; (this is a kludge to correct a
-    and mul_factor_a ; roundoff error that makes odd * odd too low)
-    and #1
+    ; + x & a & 1:
+    ; (this is a kludge to correct a
+    ; roundoff error that makes odd * odd too low)
+    ldx mul_factor_x ; 3 cyc
+    txa ; 2 cyc
+    and mul_factor_a ; 3 cyc
+    and #1 ; 2 cyc
 
-    clc
-    adc mul_product_lo
-    bcc small_product
-    inc mul_product_hi
+    clc ; 2 cyc
+    adc mul_product_lo ; 3 cyc
+    bcc small_product ; 2 cyc
+    inc mul_product_hi ; 5 cyc
+
+    ; - x^2/2
 small_product:
-    sec ; - x^2/2: 25 cycles
-    sbc mul_lobyte256,x
-    sta mul_product_lo
-    lda mul_product_hi
-    sbc mul_hibyte256,x
-    sta mul_product_hi
+    sec ; 2 cyc
+    sbc mul_lobyte256,x ; 4 cyc
+    sta mul_product_lo ; 3 cyc
+    lda mul_product_hi ; 3 cyc
+    sbc mul_hibyte256,x ; 4 cyc
+    sta mul_product_hi ; 3 cyc
     .endscope
 .endmacro
 