From 405cec6d511947ccc1a0dcc3c79e06e4ac1a5278 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 25 Dec 2024 10:51:27 -0800 Subject: [PATCH 1/3] WIP imul8 via table experiments planning to try a 64KB table of 8x7-bit multiplies in the high memory on a 130XE or other high-memory-capable machine not yet working or finished too many cycles of overhead per invocation --- imul8xe.s | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ mandel.s | 71 ++++++++++++++++++++++------------------- 2 files changed, 133 insertions(+), 32 deletions(-) create mode 100644 imul8xe.s diff --git a/imul8xe.s b/imul8xe.s new file mode 100644 index 0000000..5cbb852 --- /dev/null +++ b/imul8xe.s @@ -0,0 +1,94 @@ +FR0 = $d4 ; float48 +PORTB = $d301 + + +EXTENDED_RAM = $4000 ; 16KiB bank on the XE +bankswitch = ; ??? + +; input in X/Y (lo/hi) +; output in FR0 +; clobbers FR0 +; 128 cycles +proc imul8xe + output = FR0 + ptr = FR0 + 2 + + lda #0 ; 2 cyc + sta ptr ; 3 cyc + sta ptr + 1 ; 3 cyc + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + txa ; 2 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + tya ; 2 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + tya ; 2 cyc + and #$c0 ; 2 cyc + ; shift in extended RAM mode 2x 1 bits + sec ; 2 cyc + ror ; 2 cyc + ror ; 2 cyc + ; shift in 0 bits + asr ; 2 cyc + asr ; 2 cyc + asr ; 2 cyc + + ; save the second param for later + phy ; 3 cyc + + ; disable interrupts + lda NMIEN ; 4 cyc + pha ; 3 cyc + lda #0 ; 2 cyc + sta NMIEN ; 4 cyc + + ; set the standard top RAM and OS ROM on + or #$81 ; 2 cyc + sta PORTB ; 4 cyc + + + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc + + ; restore memory + lda #$81 ; 2 cyc + sta PORTB ; 4 cyc + + ; restore interrupts + pla ; 3 cyc + sta NMIEN ; 4 cyc + + ; check that 1 bit we skipped to fit into space + txa ; 2 cyc + and $#1 ; 2 cyc + beq done ; 2 cyc + + ; add the second param one last tie for the skipped bit + clc ; 2 cyc + pla ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc + +done: + pla + rts ; 6 cyc +endproc + +proc imul8xe_init + rts +endproc diff --git a/mandel.s b/mandel.s index 3622995..3b0bc9f 100644 --- a/mandel.s +++ b/mandel.s @@ -372,51 +372,58 @@ fill_masks: .local under256 .local next .local small_product + ; circa 92 cycles? this doesn't seem right .scope mul_factor_a = arg1 mul_factor_x = arg2 mul_product_lo = dest mul_product_hi = dest + 1 - lda mul_factor_a ; setup: 6 cycles - ;ldx mul_factor_x + lda mul_factor_a ; 3 cyc - clc ; (a + x)^2/2: 23 cycles - adc mul_factor_x - tax - bcc under256 - lda mul_hibyte512,x - bcs next + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc under256: - lda mul_hibyte256,x - sec + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc next: - sta mul_product_hi - lda mul_lobyte256,x + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc - ldx mul_factor_a ; - a^2/2: 20 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc - ldx mul_factor_x ; + x & a & 1: 22 cycles - txa ; (this is a kludge to correct a - and mul_factor_a ; roundoff error that makes odd * odd too low) - and #1 + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc - clc - adc mul_product_lo - bcc small_product - inc mul_product_hi + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc mul_product_hi ; 5 cyc + + ; - x^2/2 small_product: - sec ; - x^2/2: 25 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc .endscope .endmacro From f996c3cbcd84b3aff3fd39bf3daee9a6c60a9e2a Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 25 Dec 2024 12:47:37 -0800 Subject: [PATCH 2/3] provisional maybe old mode runs in 81-92 cycles provisional code runs in 58-77 cycles if it works ;) --- imul8xe.s | 76 ++++++++++++++++++++----------------------------------- mandel.s | 1 + 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/imul8xe.s b/imul8xe.s index 5cbb852..d12f53f 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -3,55 +3,38 @@ PORTB = $d301 EXTENDED_RAM = $4000 ; 16KiB bank on the XE -bankswitch = ; ??? -; input in X/Y (lo/hi) -; output in FR0 -; clobbers FR0 -; 128 cycles -proc imul8xe - output = FR0 - ptr = FR0 + 2 +; lookup table for top byte -> PORTB value for bank-switch +.align 256 +bankswitch: + .repeat 256, i + .byte ((i & $c0) >> 5) | $c1 + .endrepeat - lda #0 ; 2 cyc - sta ptr ; 3 cyc - sta ptr + 1 ; 3 cyc +; 58-77 cycles +.macro imul8xe dest, arg1, arg2 +.local done +.local output +.local ptr + + output = dest + ptr = dest + 2 ; scratch space assumed ; bottom 14 bits except the LSB are the per-bank table index ; add $4000 for the bank pointer - txa ; 2 cyc + lda arg1 ; 3 cyc and #$fe ; 2 cyc sta ptr ; 3 cyc - tya ; 2 cyc + lda arg2 ; 3 cyc and #$3f ; 2 cyc clc ; 2 cyc adc #$40 ; 2 cyc sta ptr + 1 ; 3 cyc ; top 2 bits are the table bank selector - tya ; 2 cyc - and #$c0 ; 2 cyc - ; shift in extended RAM mode 2x 1 bits - sec ; 2 cyc - ror ; 2 cyc - ror ; 2 cyc - ; shift in 0 bits - asr ; 2 cyc - asr ; 2 cyc - asr ; 2 cyc - - ; save the second param for later - phy ; 3 cyc - - ; disable interrupts - lda NMIEN ; 4 cyc - pha ; 3 cyc - lda #0 ; 2 cyc - sta NMIEN ; 4 cyc - - ; set the standard top RAM and OS ROM on - or #$81 ; 2 cyc - sta PORTB ; 4 cyc + ldx arg2 ; 3 cyc + lda bank_switch,x ; 4 cyc + sta PORTB ; 4 cyc ; copy the entry into output @@ -62,22 +45,21 @@ proc imul8xe lda (ptr),y ; 5 cyc sta output+1 ; 3 cyc - ; restore memory - lda #$81 ; 2 cyc - sta PORTB ; 4 cyc - - ; restore interrupts - pla ; 3 cyc - sta NMIEN ; 4 cyc + ; note: we are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled ; check that 1 bit we skipped to fit into space - txa ; 2 cyc + lda arg1 ; 3 cyc and $#1 ; 2 cyc beq done ; 2 cyc ; add the second param one last tie for the skipped bit clc ; 2 cyc - pla ; 3 cyc + lda arg2 ; 3 cyc adc output ; 3 cyc sta output ; 3 cyc lda #0 ; 2 cyc @@ -85,9 +67,7 @@ proc imul8xe sta output+1 ; 3 cyc done: - pla - rts ; 6 cyc -endproc +.endmacro proc imul8xe_init rts diff --git a/mandel.s b/mandel.s index 3b0bc9f..e0a8570 100644 --- a/mandel.s +++ b/mandel.s @@ -373,6 +373,7 @@ fill_masks: .local next .local small_product ; circa 92 cycles? this doesn't seem right + ; 81-92 cycles .scope mul_factor_a = arg1 mul_factor_x = arg2 From 829d2860e8f946a088218fa5cde2e07067e0dfa6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 12:04:01 -0800 Subject: [PATCH 3/3] :P --- imul8xe.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/imul8xe.s b/imul8xe.s index d12f53f..15adf64 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -12,6 +12,7 @@ bankswitch: .endrepeat ; 58-77 cycles +; clobbers x, y, dest to dest + 3 .macro imul8xe dest, arg1, arg2 .local done .local output @@ -54,10 +55,10 @@ bankswitch: ; check that 1 bit we skipped to fit into space lda arg1 ; 3 cyc - and $#1 ; 2 cyc + and #1 ; 2 cyc beq done ; 2 cyc - ; add the second param one last tie for the skipped bit + ; add the second param one last time for the skipped bit clc ; 2 cyc lda arg2 ; 3 cyc adc output ; 3 cyc