From a9d551a98d01a3634cb5068fc00506f2b398f8d2 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 17:50:59 -0800 Subject: [PATCH 1/5] first draft initializer --- imul8xe.s | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/imul8xe.s b/imul8xe.s index 15adf64..855e044 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -70,6 +70,106 @@ bankswitch: done: .endmacro +.macro bank_switch bank + lda #((bank << 1) | $c1) + sta PORTB +.endmacro + proc imul8xe_init + + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + rts endproc + +; Initialize a 16 KB chunk of the table +; input: multipliers in temp +; output: new multipliers in temp +; clobbers: temp, temp2 +proc imul8xe_init_section + arg1 = FR1 + arg2 = FR2 + result = FR0 + ptr = temp2 + + lda #$00 + sta ptr + lda #$40 + sta ptr + 1 + + ldx #0 + ldy #0 + + ; outer loop: $00 -> $3f +outer_loop: + + ; reset result to 0 + lda #0 + sta result + sta result + 1 + + ; inner loop: $00 -> $ff +inner_loop: + + ; copy result to data set + lda result + sta (ptr),y + lda result + 1 + sta (ptr),y + + ; result += 2 * arg2 + clc + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + + ; inner loop check + inc arg1 + inc arg1 + inc ptr + inc ptr + bne inner_loop + + ; outer loop check + inc arg2 + inc ptr + 1 + lda ptr + 1 + cmp #$40 + bne outer_loop + + rts + +endproc From 34ce9da030ea3ee9853a8e5ecf64f65798faaded Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:17:01 -0800 Subject: [PATCH 2/5] builds, not used yte --- mandel.s | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/mandel.s b/mandel.s index e0a8570..d824193 100644 --- a/mandel.s +++ b/mandel.s @@ -74,6 +74,9 @@ width = 160 half_width = width >> 1 stride = width >> 2 +EXTENDED_RAM = $4000 ; 16KiB bank on the XE +PORTB = $D301 ; memory & bank-switch for XL/XE + DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 @@ -428,6 +431,176 @@ fill_masks: .endscope .endmacro +; lookup table for top byte -> PORTB value for bank-switch +;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes +bankswitch: + .repeat 256, i + .byte ((i & $c0) >> 5) | $c1 + .endrepeat + +; 58-77 cycles +; clobbers x, y, dest to dest + 3 +.macro imul8xe dest, arg1, arg2 +.local done +.local output +.local ptr + + output = dest + ptr = dest + 2 ; scratch space assumed + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + lda arg2 ; 3 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch,x ; 4 cyc + sta PORTB ; 4 cyc + + + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc + + ; note: we are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled + + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc + + ; add the second param one last time for the skipped bit + clc ; 2 cyc + lda arg2 ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc + +done: +.endmacro + +.macro bank_switch bank + lda #((bank << 1) | $c1) + sta PORTB +.endmacro + +.proc imul8xe_init + + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + + rts +.endproc + +; Initialize a 16 KB chunk of the table +; input: multipliers in temp +; output: new multipliers in temp +; clobbers: temp, temp2 +.proc imul8xe_init_section + arg1 = FR1 + arg2 = FR2 + result = FR0 + ptr = temp2 + + lda #$00 + sta ptr + lda #$40 + sta ptr + 1 + + ldx #0 + ldy #0 + + ; outer loop: $00 -> $3f +outer_loop: + + ; reset result to 0 + lda #0 + sta result + sta result + 1 + + ; inner loop: $00 -> $ff +inner_loop: + + ; copy result to data set + lda result + sta (ptr),y + lda result + 1 + sta (ptr),y + + ; result += 2 * arg2 + clc + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + + ; inner loop check + inc arg1 + inc arg1 + inc ptr + inc ptr + bne inner_loop + + ; outer loop check + inc arg2 + inc ptr + 1 + lda ptr + 1 + cmp #$40 + bne outer_loop + + rts + +.endproc + .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) From 45c5a4cb2d62d6fbed4ba64364220eb8827369f0 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:20:10 -0800 Subject: [PATCH 3/5] called, gets lost --- mandel.s | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index d824193..a8f3cac 100644 --- a/mandel.s +++ b/mandel.s @@ -433,7 +433,7 @@ fill_masks: ; lookup table for top byte -> PORTB value for bank-switch ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes -bankswitch: +bank_switch_table: .repeat 256, i .byte ((i & $c0) >> 5) | $c1 .endrepeat @@ -460,9 +460,9 @@ bankswitch: sta ptr + 1 ; 3 cyc ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch,x ; 4 cyc - sta PORTB ; 4 cyc + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc ; copy the entry into output @@ -612,20 +612,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8 result, arg1, arg2 + imul8xe result, arg1, arg2 lda #0 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2 + imul8xe inter, arg1 + 1, arg2 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1 + imul8xe inter, arg1, arg2 + 1 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1 + imul8xe inter, arg1 + 1, arg2 + 1 add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -1147,6 +1147,8 @@ zero_byte_loop: .proc start + jsr imul8xe_init + ; ox = 0; oy = 0; zoom = 0 ; count_frames = 0; count_pixels = 0 lda #0 From 0cde31905e62b9b97d8df3ea03c73a89bbb5d602 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 18:35:37 -0800 Subject: [PATCH 4/5] runs but doesn't work --- mandel.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mandel.s b/mandel.s index a8f3cac..79d9c78 100644 --- a/mandel.s +++ b/mandel.s @@ -548,7 +548,6 @@ done: lda #$40 sta ptr + 1 - ldx #0 ldy #0 ; outer loop: $00 -> $3f @@ -566,7 +565,9 @@ inner_loop: lda result sta (ptr),y lda result + 1 + iny sta (ptr),y + dey ; result += 2 * arg2 clc @@ -594,7 +595,7 @@ inner_loop: inc arg2 inc ptr + 1 lda ptr + 1 - cmp #$40 + cmp #$80 bne outer_loop rts From e84a990789b13c6c67e63cdb2db2a2be2b7893a6 Mon Sep 17 00:00:00 2001 From: Brooke Vibber Date: Thu, 26 Dec 2024 21:41:03 -0800 Subject: [PATCH 5/5] tweaks: --- mandel.s | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mandel.s b/mandel.s index 79d9c78..8c6130b 100644 --- a/mandel.s +++ b/mandel.s @@ -435,9 +435,15 @@ fill_masks: ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 5) | $c1 + .byte ((i & $c0) >> 4) | $d1 .endrepeat +.macro bank_switch bank + lda #((bank << 2) | $d1) + sta PORTB +.endmacro + + ; 58-77 cycles ; clobbers x, y, dest to dest + 3 .macro imul8xe dest, arg1, arg2 @@ -497,11 +503,6 @@ bank_switch_table: done: .endmacro -.macro bank_switch bank - lda #((bank << 1) | $c1) - sta PORTB -.endmacro - .proc imul8xe_init ; go through the input set, in four 16KB chunks @@ -576,13 +577,14 @@ inner_loop: sta result lda #0 adc result + 1 - sta result + sta result + 1 + clc lda arg2 adc result sta result lda #0 adc result + 1 - sta result + sta result + 1 ; inner loop check inc arg1