From 405cec6d511947ccc1a0dcc3c79e06e4ac1a5278 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 25 Dec 2024 10:51:27 -0800 Subject: [PATCH 01/10] WIP imul8 via table experiments planning to try a 64KB table of 8x7-bit multiplies in the high memory on a 130XE or other high-memory-capable machine not yet working or finished too many cycles of overhead per invocation --- imul8xe.s | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ mandel.s | 71 ++++++++++++++++++++++------------------- 2 files changed, 133 insertions(+), 32 deletions(-) create mode 100644 imul8xe.s diff --git a/imul8xe.s b/imul8xe.s new file mode 100644 index 0000000..5cbb852 --- /dev/null +++ b/imul8xe.s @@ -0,0 +1,94 @@ +FR0 = $d4 ; float48 +PORTB = $d301 + + +EXTENDED_RAM = $4000 ; 16KiB bank on the XE +bankswitch = ; ??? + +; input in X/Y (lo/hi) +; output in FR0 +; clobbers FR0 +; 128 cycles +proc imul8xe + output = FR0 + ptr = FR0 + 2 + + lda #0 ; 2 cyc + sta ptr ; 3 cyc + sta ptr + 1 ; 3 cyc + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + txa ; 2 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + tya ; 2 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + tya ; 2 cyc + and #$c0 ; 2 cyc + ; shift in extended RAM mode 2x 1 bits + sec ; 2 cyc + ror ; 2 cyc + ror ; 2 cyc + ; shift in 0 bits + asr ; 2 cyc + asr ; 2 cyc + asr ; 2 cyc + + ; save the second param for later + phy ; 3 cyc + + ; disable interrupts + lda NMIEN ; 4 cyc + pha ; 3 cyc + lda #0 ; 2 cyc + sta NMIEN ; 4 cyc + + ; set the standard top RAM and OS ROM on + or #$81 ; 2 cyc + sta PORTB ; 4 cyc + + + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc + + ; restore memory + lda #$81 ; 2 cyc + sta PORTB ; 4 cyc + + ; restore interrupts + pla ; 3 cyc + sta NMIEN ; 4 cyc + + ; check that 1 bit we skipped to fit 
into space + txa ; 2 cyc + and $#1 ; 2 cyc + beq done ; 2 cyc + + ; add the second param one last tie for the skipped bit + clc ; 2 cyc + pla ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc + +done: + pla + rts ; 6 cyc +endproc + +proc imul8xe_init + rts +endproc diff --git a/mandel.s b/mandel.s index 3622995..3b0bc9f 100644 --- a/mandel.s +++ b/mandel.s @@ -372,51 +372,58 @@ fill_masks: .local under256 .local next .local small_product + ; circa 92 cycles? this doesn't seem right .scope mul_factor_a = arg1 mul_factor_x = arg2 mul_product_lo = dest mul_product_hi = dest + 1 - lda mul_factor_a ; setup: 6 cycles - ;ldx mul_factor_x + lda mul_factor_a ; 3 cyc - clc ; (a + x)^2/2: 23 cycles - adc mul_factor_x - tax - bcc under256 - lda mul_hibyte512,x - bcs next + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc under256: - lda mul_hibyte256,x - sec + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc next: - sta mul_product_hi - lda mul_lobyte256,x + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc - ldx mul_factor_a ; - a^2/2: 20 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc - ldx mul_factor_x ; + x & a & 1: 22 cycles - txa ; (this is a kludge to correct a - and mul_factor_a ; roundoff error that makes odd * odd too low) - and #1 + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc - clc - adc mul_product_lo - bcc small_product - inc mul_product_hi + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc 
mul_product_hi ; 5 cyc + + ; - x^2/2 small_product: - sec ; - x^2/2: 25 cycles - sbc mul_lobyte256,x - sta mul_product_lo - lda mul_product_hi - sbc mul_hibyte256,x - sta mul_product_hi + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc .endscope .endmacro From f996c3cbcd84b3aff3fd39bf3daee9a6c60a9e2a Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Wed, 25 Dec 2024 12:47:37 -0800 Subject: [PATCH 02/10] provisional maybe old mode runs in 81-92 cycles provisional code runs in 58-77 cycles if it works ;) --- imul8xe.s | 76 ++++++++++++++++++++----------------------------------- mandel.s | 1 + 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/imul8xe.s b/imul8xe.s index 5cbb852..d12f53f 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -3,55 +3,38 @@ PORTB = $d301 EXTENDED_RAM = $4000 ; 16KiB bank on the XE -bankswitch = ; ??? -; input in X/Y (lo/hi) -; output in FR0 -; clobbers FR0 -; 128 cycles -proc imul8xe - output = FR0 - ptr = FR0 + 2 +; lookup table for top byte -> PORTB value for bank-switch +.align 256 +bankswitch: + .repeat 256, i + .byte ((i & $c0) >> 5) | $c1 + .endrepeat - lda #0 ; 2 cyc - sta ptr ; 3 cyc - sta ptr + 1 ; 3 cyc +; 58-77 cycles +.macro imul8xe dest, arg1, arg2 +.local done +.local output +.local ptr + + output = dest + ptr = dest + 2 ; scratch space assumed ; bottom 14 bits except the LSB are the per-bank table index ; add $4000 for the bank pointer - txa ; 2 cyc + lda arg1 ; 3 cyc and #$fe ; 2 cyc sta ptr ; 3 cyc - tya ; 2 cyc + lda arg2 ; 3 cyc and #$3f ; 2 cyc clc ; 2 cyc adc #$40 ; 2 cyc sta ptr + 1 ; 3 cyc ; top 2 bits are the table bank selector - tya ; 2 cyc - and #$c0 ; 2 cyc - ; shift in extended RAM mode 2x 1 bits - sec ; 2 cyc - ror ; 2 cyc - ror ; 2 cyc - ; shift in 0 bits - asr ; 2 cyc - asr ; 2 cyc - asr ; 2 cyc - - ; save the second param for later - phy ; 3 cyc - - ; disable interrupts - lda NMIEN ; 4 cyc - 
pha ; 3 cyc - lda #0 ; 2 cyc - sta NMIEN ; 4 cyc - - ; set the standard top RAM and OS ROM on - or #$81 ; 2 cyc - sta PORTB ; 4 cyc + ldx arg2 ; 3 cyc + lda bank_switch,x ; 4 cyc + sta PORTB ; 4 cyc ; copy the entry into output @@ -62,22 +45,21 @@ proc imul8xe lda (ptr),y ; 5 cyc sta output+1 ; 3 cyc - ; restore memory - lda #$81 ; 2 cyc - sta PORTB ; 4 cyc - - ; restore interrupts - pla ; 3 cyc - sta NMIEN ; 4 cyc + ; note: we are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled ; check that 1 bit we skipped to fit into space - txa ; 2 cyc + lda arg1 ; 3 cyc and $#1 ; 2 cyc beq done ; 2 cyc ; add the second param one last tie for the skipped bit clc ; 2 cyc - pla ; 3 cyc + lda arg2 ; 3 cyc adc output ; 3 cyc sta output ; 3 cyc lda #0 ; 2 cyc @@ -85,9 +67,7 @@ proc imul8xe sta output+1 ; 3 cyc done: - pla - rts ; 6 cyc -endproc +.endmacro proc imul8xe_init rts diff --git a/mandel.s b/mandel.s index 3b0bc9f..e0a8570 100644 --- a/mandel.s +++ b/mandel.s @@ -373,6 +373,7 @@ fill_masks: .local next .local small_product ; circa 92 cycles? 
this doesn't seem right + ; 81-92 cycles .scope mul_factor_a = arg1 mul_factor_x = arg2 From 829d2860e8f946a088218fa5cde2e07067e0dfa6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 12:04:01 -0800 Subject: [PATCH 03/10] :P --- imul8xe.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/imul8xe.s b/imul8xe.s index d12f53f..15adf64 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -12,6 +12,7 @@ bankswitch: .endrepeat ; 58-77 cycles +; clobbers x, y, dest to dest + 3 .macro imul8xe dest, arg1, arg2 .local done .local output @@ -54,10 +55,10 @@ bankswitch: ; check that 1 bit we skipped to fit into space lda arg1 ; 3 cyc - and $#1 ; 2 cyc + and #1 ; 2 cyc beq done ; 2 cyc - ; add the second param one last tie for the skipped bit + ; add the second param one last time for the skipped bit clc ; 2 cyc lda arg2 ; 3 cyc adc output ; 3 cyc From a9d551a98d01a3634cb5068fc00506f2b398f8d2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 17:50:59 -0800 Subject: [PATCH 04/10] first draft initializer --- imul8xe.s | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/imul8xe.s b/imul8xe.s index 15adf64..855e044 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -70,6 +70,106 @@ bankswitch: done: .endmacro +.macro bank_switch bank + lda #((bank << 1) | $c1) + sta PORTB +.endmacro + proc imul8xe_init + + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + rts endproc + +; Initialize a 16 KB chunk of the table +; input: multipliers in temp +; output: new multipliers in temp +; clobbers: temp, temp2 +proc imul8xe_init_section + arg1 
= FR1 + arg2 = FR2 + result = FR0 + ptr = temp2 + + lda #$00 + sta ptr + lda #$40 + sta ptr + 1 + + ldx #0 + ldy #0 + + ; outer loop: $00 -> $3f +outer_loop: + + ; reset result to 0 + lda #0 + sta result + sta result + 1 + + ; inner loop: $00 -> $ff +inner_loop: + + ; copy result to data set + lda result + sta (ptr),y + lda result + 1 + sta (ptr),y + + ; result += 2 * arg2 + clc + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + + ; inner loop check + inc arg1 + inc arg1 + inc ptr + inc ptr + bne inner_loop + + ; outer loop check + inc arg2 + inc ptr + 1 + lda ptr + 1 + cmp #$40 + bne outer_loop + + rts + +endproc From 34ce9da030ea3ee9853a8e5ecf64f65798faaded Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:17:01 -0800 Subject: [PATCH 05/10] builds, not used yet --- mandel.s | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/mandel.s b/mandel.s index e0a8570..d824193 100644 --- a/mandel.s +++ b/mandel.s @@ -74,6 +74,9 @@ width = 160 half_width = width >> 1 stride = width >> 2 +EXTENDED_RAM = $4000 ; 16KiB bank on the XE +PORTB = $D301 ; memory & bank-switch for XL/XE + DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 @@ -428,6 +431,176 @@ fill_masks: .endscope .endmacro +; lookup table for top byte -> PORTB value for bank-switch +;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes +bankswitch: + .repeat 256, i + .byte ((i & $c0) >> 5) | $c1 + .endrepeat + +; 58-77 cycles +; clobbers x, y, dest to dest + 3 +.macro imul8xe dest, arg1, arg2 +.local done +.local output +.local ptr + + output = dest + ptr = dest + 2 ; scratch space assumed + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + lda arg2 ; 3 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta
ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch,x ; 4 cyc + sta PORTB ; 4 cyc + + + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc + + ; note: we are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled + + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc + + ; add the second param one last time for the skipped bit + clc ; 2 cyc + lda arg2 ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc + +done: +.endmacro + +.macro bank_switch bank + lda #((bank << 1) | $c1) + sta PORTB +.endmacro + +.proc imul8xe_init + + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + + rts +.endproc + +; Initialize a 16 KB chunk of the table +; input: multipliers in temp +; output: new multipliers in temp +; clobbers: temp, temp2 +.proc imul8xe_init_section + arg1 = FR1 + arg2 = FR2 + result = FR0 + ptr = temp2 + + lda #$00 + sta ptr + lda #$40 + sta ptr + 1 + + ldx #0 + ldy #0 + + ; outer loop: $00 -> $3f +outer_loop: + + ; reset result to 0 + lda #0 + sta result + sta result + 1 + + ; inner loop: $00 -> $ff +inner_loop: + + ; copy result to data set + lda result + sta (ptr),y + lda result + 1 + sta (ptr),y + + ; result += 2 * arg2 + clc + lda arg2 + adc result + sta result + lda #0 + adc result 
+ 1 + sta result + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + + ; inner loop check + inc arg1 + inc arg1 + inc ptr + inc ptr + bne inner_loop + + ; outer loop check + inc arg2 + inc ptr + 1 + lda ptr + 1 + cmp #$40 + bne outer_loop + + rts + +.endproc + .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) From 45c5a4cb2d62d6fbed4ba64364220eb8827369f0 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:20:10 -0800 Subject: [PATCH 06/10] called, gets lost --- mandel.s | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index d824193..a8f3cac 100644 --- a/mandel.s +++ b/mandel.s @@ -433,7 +433,7 @@ fill_masks: ; lookup table for top byte -> PORTB value for bank-switch ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes -bankswitch: +bank_switch_table: .repeat 256, i .byte ((i & $c0) >> 5) | $c1 .endrepeat @@ -460,9 +460,9 @@ bankswitch: sta ptr + 1 ; 3 cyc ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch,x ; 4 cyc - sta PORTB ; 4 cyc + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc ; copy the entry into output @@ -612,20 +612,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8 result, arg1, arg2 + imul8xe result, arg1, arg2 lda #0 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2 + imul8xe inter, arg1 + 1, arg2 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1 + imul8xe inter, arg1, arg2 + 1 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1 + imul8xe inter, arg1 + 1, arg2 + 1 add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -1147,6 +1147,8 @@ zero_byte_loop: .proc start + jsr imul8xe_init + ; ox = 0; oy = 0; zoom = 0 ; count_frames = 0; count_pixels = 0 lda #0 
From 0cde31905e62b9b97d8df3ea03c73a89bbb5d602 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:35:37 -0800 Subject: [PATCH 07/10] runs but doesn't work --- mandel.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index a8f3cac..79d9c78 100644 --- a/mandel.s +++ b/mandel.s @@ -548,7 +548,6 @@ done: lda #$40 sta ptr + 1 - ldx #0 ldy #0 ; outer loop: $00 -> $3f @@ -566,7 +565,9 @@ inner_loop: lda result sta (ptr),y lda result + 1 + iny sta (ptr),y + dey ; result += 2 * arg2 clc @@ -594,7 +595,7 @@ inner_loop: inc arg2 inc ptr + 1 lda ptr + 1 - cmp #$40 + cmp #$80 bne outer_loop rts From e84a990789b13c6c67e63cdb2db2a2be2b7893a6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 21:41:03 -0800 Subject: [PATCH 08/10] tweaks: --- mandel.s | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 79d9c78..8c6130b 100644 --- a/mandel.s +++ b/mandel.s @@ -435,9 +435,15 @@ fill_masks: ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 5) | $c1 + .byte ((i & $c0) >> 4) | $d1 .endrepeat +.macro bank_switch bank + lda #((bank << 2) | $d1) + sta PORTB +.endmacro + + ; 58-77 cycles ; clobbers x, y, dest to dest + 3 .macro imul8xe dest, arg1, arg2 @@ -497,11 +503,6 @@ bank_switch_table: done: .endmacro -.macro bank_switch bank - lda #((bank << 1) | $c1) - sta PORTB -.endmacro - .proc imul8xe_init ; go through the input set, in four 16KB chunks @@ -576,13 +577,14 @@ inner_loop: sta result lda #0 adc result + 1 - sta result + sta result + 1 + clc lda arg2 adc result sta result lda #0 adc result + 1 - sta result + sta result + 1 ; inner loop check inc arg1 From ee1c2687054d760d21ffe8f1be97eb5eb6ecc7b9 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 21:49:13 -0800 Subject: [PATCH 09/10] it works --- mandel.s | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/mandel.s b/mandel.s index 8c6130b..3ff91d1 100644 --- a/mandel.s +++ b/mandel.s @@ -435,11 +435,11 @@ fill_masks: ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 4) | $d1 + .byte ((i & $c0) >> 4) | $e1 .endrepeat .macro bank_switch bank - lda #((bank << 2) | $d1) + lda #((bank << 2) | $e1) sta PORTB .endmacro From 83cba4afa3e28cc8f6b0377c9edc49e60af36187 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Fri, 27 Dec 2024 18:37:03 -0800 Subject: [PATCH 10/10] Runtime detection of XE-style extended memory Uses the "big multiplication table" in 64KB of extended memory if bank switching appears to work, otherwise uses the table of squares lookups. Initial view clocks in at 13.133 ms/px for the XE version and still 14.211 ms/px for the 400/800/XL version. Tested in emulator with 130XE and XL+Ultimate 1MB upgrade configs, and base implementation on the 800XL emulator. --- mandel.s | 75 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/mandel.s b/mandel.s index 3ff91d1..d198989 100644 --- a/mandel.s +++ b/mandel.s @@ -347,14 +347,6 @@ fill_masks: neg 4, arg .endmacro -; 518 - 828 cyc -.macro imul16 dest, arg1, arg2 - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc - copy32 dest, FR2 ; 24 cyc -.endmacro - .macro shift_round_16 arg, shift .repeat shift shl32 arg @@ -365,7 +357,7 @@ fill_masks: .macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc + jsr imul16_func ; ? 
cyc shift_round_16 FR2, shift copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -505,6 +497,30 @@ done: .proc imul8xe_init + bank_switch 0 + lda #0 + sta EXTENDED_RAM + bank_switch 1 + lda #1 + sta EXTENDED_RAM + bank_switch 0 + lda EXTENDED_RAM + beq init + + ; no bank switching available, we just overwrite the value in base ram + rts + +init: + + ; patch imul16_func into a forwarding thunk to imul16xe_func + lda #$4c ; 'jmp' opcode + sta imul16_func + lda #.lobyte(imul16xe_func) + sta imul16_func + 1 + lda #.hibyte(imul16xe_func) + sta imul16_func + 2 + + ; create the lookup table ; go through the input set, in four 16KB chunks arg1 = FR1 @@ -615,6 +631,47 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + imul8 result, arg1, arg2 + lda #0 + sta result + 2 + sta result + 3 + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter + + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg1 + 1 + bpl arg1_pos + sub16 result + 2, result + 2, arg2 +arg1_pos: + lda arg2 + 1 + bpl arg2_pos + sub16 result + 2, result + 2, arg1 +arg2_pos: + + rts ; 6 cyc +.endproc + +.proc imul16xe_func + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + imul8xe result, arg1, arg2 lda #0 sta result + 2