Compare commits

...

5 commits

Author SHA1 Message Date
e84a990789 tweaks: 2024-12-26 21:41:03 -08:00
0cde31905e runs but doesn't work 2024-12-26 18:35:37 -08:00
45c5a4cb2d called, gets lost 2024-12-26 18:20:10 -08:00
34ce9da030 builds, not used yet 2024-12-26 18:17:01 -08:00
a9d551a98d first draft initializer 2024-12-26 17:50:59 -08:00
2 changed files with 282 additions and 4 deletions

100
imul8xe.s
View file

@ -70,6 +70,106 @@ bankswitch:
done:
.endmacro
; Map one of the four 16 KiB extended-RAM banks into the CPU's view of
; $4000-$7FFF.
;   bank: assemble-time constant, 0-3
; PORTB bits on the 130XE: bit 0 = OS ROM on, bit 1 = BASIC off,
; bits 2-3 = bank number, bit 4 = 0 routes CPU accesses to the bank,
; bit 5 = 0 routes ANTIC accesses to the bank, bit 7 = self-test off.
; $e1 therefore gives the CPU the selected bank while ANTIC keeps
; reading main RAM.
; NOTE(review): bit 1 stays 0 as in the previous value, which leaves the
; BASIC ROM mapped at $A000-$BFFF -- confirm that is intended.
; clobbers: A, flags
.macro bank_switch bank
    lda #((bank << 2) | $e1)
    sta PORTB
.endmacro
; Build the complete multiplication table, one 16 KiB bank at a time.
; arg1 and arg2 are zeroed here and then advanced by
; imul8xe_init_section, so each call continues where the previous
; one stopped.
; clobbers: A, flags, PORTB, plus everything imul8xe_init_section clobbers
; The last bank selected stays mapped in on return.
.proc imul8xe_init
    arg1 = FR1
    arg2 = FR2
    result = FR0

    lda #$00
    sta arg1
    sta arg2

    ; $00 * $00 -> $3f * $ff
    bank_switch 0
    jsr imul8xe_init_section

    ; $40 * $00 -> $7f * $ff
    bank_switch 1
    jsr imul8xe_init_section

    ; $80 * $00 -> $bf * $ff
    bank_switch 2
    jsr imul8xe_init_section

    ; $c0 * $00 -> $ff * $ff
    bank_switch 3
    jsr imul8xe_init_section

    rts
.endproc
; Initialize a 16 KB chunk of the table: the bank currently mapped at
; $4000-$7FFF receives 64 pages, one page per value of arg2, each
; holding 128 little-endian 16-bit products arg1 * arg2 for
; arg1 = 0, 2, 4, ... $fe.
; input:  arg2 (FR2) = first multiplier for this bank
;         arg1 (FR1) = even counter, only stepped here, never read
; output: arg2 advanced by $40, ready for the next bank
; clobbers: A, Y, flags, FR0/FR0+1 (result), temp2/temp2+1 (ptr)
.proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    result = FR0
    ptr = temp2

    ; ptr = $4000, start of the bank window
    lda #$00
    sta ptr
    lda #$40
    sta ptr + 1
    ldy #0

    ; outer loop: one page per arg2 value, $40 pages in total
outer_loop:
    ; reset result to 0 (arg1 = 0 at the start of every page)
    lda #0
    sta result
    sta result + 1

    ; inner loop: arg1 = $00 -> $fe in steps of 2
inner_loop:
    ; copy result to data set: low byte at ptr, high byte at ptr + 1
    lda result
    sta (ptr),y
    lda result + 1
    iny
    sta (ptr),y
    dey

    ; result += 2 * arg2, done as two 16-bit additions of arg2
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1

    ; inner loop check: ptr low byte wraps to 0 after 128 entries
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop

    ; outer loop check: stop once ptr reaches $8000, the end of the window
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #$80
    bne outer_loop

    rts
.endproc

186
mandel.s
View file

@ -74,6 +74,9 @@ width = 160
half_width = width >> 1 ; width / 2
stride = width >> 2 ; width / 4
EXTENDED_RAM = $4000 ; start of the 16KiB window where an XE extended-RAM bank appears
PORTB = $D301 ; memory & bank-switch control for XL/XE
DMACTL = $D400 ; presumably the ANTIC DMA control register -- usage not shown in this excerpt
DLISTL = $D402 ; presumably display-list pointer, low byte -- usage not shown in this excerpt
DLISTH = $D403 ; presumably display-list pointer, high byte -- usage not shown in this excerpt
@ -428,6 +431,179 @@ fill_masks:
.endscope
.endmacro
; lookup table for top byte -> PORTB value for bank-switch
; Entry i selects bank (i >> 6): the top two bits of the index are moved
; into PORTB bits 2-3.
; PORTB bits on the 130XE: bit 0 = OS ROM on, bit 1 = BASIC off,
; bits 2-3 = bank number, bit 4 = 0 routes CPU accesses to the bank,
; bit 5 = 0 routes ANTIC accesses to the bank, bit 7 = self-test off.
; $e1 gives the CPU the selected bank at $4000-$7FFF while ANTIC keeps
; reading main RAM. (The earlier $d1 had bits 4 and 5 the other way
; round, so the CPU never saw the extended banks.)
; NOTE(review): bit 1 stays 0 as before, which leaves the BASIC ROM
; mapped at $A000-$BFFF -- confirm that is intended.
;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
bank_switch_table:
    .repeat 256, i
        .byte ((i & $c0) >> 4) | $e1
    .endrepeat

; Map extended-RAM bank `bank` (assemble-time constant, 0-3) into the
; CPU's view of $4000-$7FFF. Must stay in agreement with
; bank_switch_table above.
; clobbers: A, flags
.macro bank_switch bank
    lda #((bank << 2) | $e1)
    sta PORTB
.endmacro
; imul8xe dest, arg1, arg2
; 8-bit x 8-bit -> 16-bit multiply by lookup in the banked table that
; imul8xe_init builds at $4000-$7FFF.
; The table only holds entries for even arg1, so the entry for
; (arg1 & $fe, arg2) is fetched and arg2 is added once more when arg1 is odd.
;   dest: address that receives the 16-bit product (low byte first)
;   arg1, arg2: addresses of the two 8-bit operands (not modified)
; 58-77 cycles by the per-line counts below (which assume zero-page
; operands); the taken beq costs 3 rather than 2, and the table read
; costs 1 more if bank_switch_table,x crosses a page
; clobbers a, x, y, flags, dest to dest + 3, and PORTB (left on the table bank)
.macro imul8xe dest, arg1, arg2
.local done
.local output
.local ptr
output = dest
ptr = dest + 2 ; scratch space assumed
; build the entry address inside the bank window:
; low byte = arg1 with its LSB dropped (entries are 2 bytes wide),
; high byte = $40 + low 6 bits of arg2 (one page per arg2 value)
lda arg1 ; 3 cyc
and #$fe ; 2 cyc
sta ptr ; 3 cyc
lda arg2 ; 3 cyc
and #$3f ; 2 cyc
clc ; 2 cyc
adc #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; top 2 bits of arg2 are the table bank selector
ldx arg2 ; 3 cyc
lda bank_switch_table,x ; 4 cyc
sta PORTB ; 4 cyc
; copy the 16-bit entry into output
ldy #0 ; 2 cyc
lda (ptr),y ; 5 cyc
sta output ; 3 cyc
iny ; 2 cyc
lda (ptr),y ; 5 cyc
sta output+1 ; 3 cyc
; note: we are not restoring memory to save 6 cycles!
; this means those 16kb have to be switched back to base RAM
; if we need to use them anywhere else
; NOTE(review): the disabled restore value $81 below has bits 4 and 5
; clear; going by the 130XE PORTB layout that would keep an extended
; bank mapped rather than base RAM -- confirm before re-enabling
;;; restore memory
;;lda #$81 ; 2 cyc - disabled
;;sta PORTB ; 4 cyc - disabled
; check the low bit of arg1 that was dropped from the table index
lda arg1 ; 3 cyc
and #1 ; 2 cyc
beq done ; 2 cyc
; arg1 was odd: add arg2 one last time for the skipped bit
clc ; 2 cyc
lda arg2 ; 3 cyc
adc output ; 3 cyc
sta output ; 3 cyc
lda #0 ; 2 cyc
adc output+1 ; 3 cyc
sta output+1 ; 3 cyc
done:
.endmacro
; Populate the whole multiplication table in extended RAM.
; The table is split over four 16KB banks; each bank is mapped in turn
; and filled by imul8xe_init_section, which also advances arg2 so the
; next bank continues with the next $40 multipliers.
; clobbers: A, flags, PORTB, plus whatever imul8xe_init_section clobbers
; The last bank stays mapped in on return.
.proc imul8xe_init
    arg1 = FR1
    arg2 = FR2
    result = FR0

    ; both multipliers start from zero
    lda #$00
    sta arg2
    sta arg1

    bank_switch 0                   ; $00 * $00 -> $3f * $ff
    jsr imul8xe_init_section

    bank_switch 1                   ; $40 * $00 -> $7f * $ff
    jsr imul8xe_init_section

    bank_switch 2                   ; $80 * $00 -> $bf * $ff
    jsr imul8xe_init_section

    bank_switch 3                   ; $c0 * $00 -> $ff * $ff
    jmp imul8xe_init_section        ; tail call: its rts returns to our caller
.endproc
; Fill the 16 KB bank currently mapped at EXTENDED_RAM with its share of
; the multiplication table: 64 pages, one per value of arg2, each holding
; 128 little-endian 16-bit products arg1 * arg2 for arg1 = 0, 2, ... $fe.
; input:  arg2 (FR2) = first multiplier of this bank
;         arg1 (FR1) = even counter; stepped here but never read
; output: arg2 advanced by $40 for the next bank
; clobbers: A, Y, flags, FR0/FR0+1 (product), temp2/temp2+1 (pointer)
.proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    product = FR0
    cursor = temp2

    ; cursor = start of the bank window
    lda #<EXTENDED_RAM
    sta cursor
    lda #>EXTENDED_RAM
    sta cursor + 1
    ldy #0

    ; one pass per page / per arg2 value
next_page:
    ; arg1 is 0 at the top of every page, so the product restarts at 0
    lda #0
    sta product + 1
    sta product

    ; one pass per table entry, arg1 stepping by 2
next_entry:
    ; write the entry, high byte first, leaving Y back at 0
    ldy #1
    lda product + 1
    sta (cursor),y
    dey
    lda product
    sta (cursor),y

    ; product += arg2, first time
    lda product
    clc
    adc arg2
    sta product
    lda product + 1
    adc #0
    sta product + 1

    ; product += arg2, second time (arg1 advances by 2 per entry)
    lda product
    clc
    adc arg2
    sta product
    lda product + 1
    adc #0
    sta product + 1

    ; advance to the next entry; the low byte wraps to 0 after 128 entries
    inc arg1
    inc arg1
    inc cursor
    inc cursor
    bne next_entry

    ; advance to the next page until the end of the 16 KB window
    inc arg2
    inc cursor + 1
    lda cursor + 1
    cmp #>(EXTENDED_RAM + $4000)
    bne next_page

    rts
.endproc
.proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
@ -439,20 +615,20 @@ fill_masks:
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2
imul8xe result, arg1, arg2
lda #0
sta result + 2
sta result + 3
imul8 inter, arg1 + 1, arg2
imul8xe inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1, arg2 + 1
imul8xe inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter
add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1
imul8xe inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
@ -974,6 +1150,8 @@ zero_byte_loop:
.proc start
jsr imul8xe_init
; ox = 0; oy = 0; zoom = 0
; count_frames = 0; count_pixels = 0
lda #0