Compare commits

...

7 commits

Author SHA1 Message Date
70d2c91f03 fix bank switch on xl/xe
was accidentally enabling basic rom :D

5m46s - 11.759 ms/px - 800xl
5m30s - 11.215 ms/px - 130xe
2024-12-30 03:56:35 -08:00
acac5a8df4 moving the framebuffer into the basic space
fails on 130xe and 800xl for some reason

works on 800 as expected
2024-12-29 21:19:55 -08:00
883f926e57 split memory, wip
appears to work on 800 but xl/xe overlap basic lol
2024-12-29 21:06:48 -08:00
0c63430dd9 wip tables segment to be 2024-12-29 20:37:58 -08:00
3ab5006aa3 wip refactoring 2024-12-29 17:56:14 -08:00
f903272335 refactoring and start on squares 2024-12-29 17:37:06 -08:00
8ad996981a whoops 2024-12-29 13:19:58 -08:00
4 changed files with 220 additions and 164 deletions

View file

@ -2,8 +2,8 @@
all : mandel.xex all : mandel.xex
mandel.xex : mandel.o tables.o mandel.xex : mandel.o tables.o atari-asm-xex.cfg
ld65 -C ./atari-asm-xex.cfg -o $@ $+ ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
%.o : %.s %.o : %.s
ca65 -o $@ $< ca65 -o $@ $<

View file

@ -6,7 +6,10 @@ SYMBOLS {
} }
MEMORY { MEMORY {
ZP: file = "", define = yes, start = $0082, size = $007E; ZP: file = "", define = yes, start = $0082, size = $007E;
MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; MAIN: file = %O, define = yes, start = %S, size = $4000 - %S;
# Keep $4000-$7fff clear for expanded RAM access window
TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000;
# Keep $a000-$bfff clear for BASIC cartridge
} }
FILES { FILES {
%O: format = atari; %O: format = atari;
@ -21,5 +24,5 @@ SEGMENTS {
RODATA: load = MAIN, type = ro optional = yes; RODATA: load = MAIN, type = ro optional = yes;
DATA: load = MAIN, type = rw optional = yes; DATA: load = MAIN, type = rw optional = yes;
BSS: load = MAIN, type = bss, optional = yes, define = yes; BSS: load = MAIN, type = bss, optional = yes, define = yes;
TABLES: load = MAIN, type = ro, optional = yes, align = 256; TABLES: load = TABLES, type = ro, optional = yes, align = 256;
} }

359
mandel.s
View file

@ -62,11 +62,11 @@ FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
FMOVE = $DDB6 ; MOVE FR0 TO FR1 FMOVE = $DDB6 ; MOVE FR0 TO FR1
; High data ; High data
framebuffer_top = $8000 framebuffer_top = $a000
textbuffer = $8f00 textbuffer = $af00
framebuffer_bottom = $9000 framebuffer_bottom = $b000
display_list = $9f00 display_list = $bf00
framebuffer_end = $a000 framebuffer_end = $c000
height = 184 height = 184
half_height = height >> 1 half_height = height >> 1
@ -107,12 +107,14 @@ KEY_RIGHT = $87
.struct float48 .struct float48
exponent .byte exponent .byte
mantissa .byte 6 mantissa .byte 5
.endstruct .endstruct
.import mul_lobyte256 .import mul_lobyte256
.import mul_hibyte256 .import mul_hibyte256
.import mul_hibyte512 .import mul_hibyte512
.import sqr_lobyte
.import sqr_hibyte
.data .data
@ -374,137 +376,150 @@ viewport_oy:
copy16 dest, FR2 + 2 ; 12 cyc copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; Adapted from https://everything2.com/title/Fast+6502+multiplication .macro sqr16_round dest, arg, shift
.macro imul8 dest, arg1, arg2 ;imul16_round dest, arg, arg, shift
.local under256 copy16 FR0, arg ; 12 cyc
.local next jsr sqr16_func ; ? cyc
.local small_product shift_round_16 FR2, shift
; circa 92 cycles? this doesn't seem right copy16 dest, FR2 + 2 ; 12 cyc
; 81-92 cycles .endmacro
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; 3 cyc ; clobbers a, x
.macro sqr8 dest, arg
; (a + x)^2/2 ldx arg
clc ; 2 cyc lda sqr_lobyte,x
adc mul_factor_x ; 3 cyc sta dest
tax ; 2 cyc lda sqr_hibyte,x
bcc under256 ; 2 cyc sta dest + 1
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
under256:
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
next:
sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
; + x & a & 1:
; (this is a kludge to correct a
; roundoff error that makes odd * odd too low)
ldx mul_factor_x ; 3 cyc
txa ; 2 cyc
and mul_factor_a ; 3 cyc
and #1 ; 2 cyc
clc ; 2 cyc
adc mul_product_lo ; 3 cyc
bcc small_product ; 2 cyc
inc mul_product_hi ; 5 cyc
; - x^2/2
small_product:
sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
.endscope
.endmacro .endmacro
; lookup table for top byte -> PORTB value for bank-switch ; lookup table for top byte -> PORTB value for bank-switch
;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
bank_switch_table: bank_switch_table:
.repeat 256, i .repeat 256, i
.byte ((i & $c0) >> 4) | $e1 .byte ((i & $c0) >> 4) | $e3
.endrepeat .endrepeat
.macro bank_switch bank .macro bank_switch bank
lda #((bank << 2) | $e1) lda #((bank << 2) | $e3)
sta PORTB sta PORTB
.endmacro .endmacro
.macro imul8 dest, arg1, arg2, xe
.if xe
; using 64KB lookup table
; 58-77 cycles
; clobbers x, y, dest to dest + 3
.scope
output = dest
ptr = dest + 2 ; scratch space assumed
; 58-77 cycles ; bottom 14 bits except the LSB are the per-bank table index
; clobbers x, y, dest to dest + 3 ; add $4000 for the bank pointer
.macro imul8xe dest, arg1, arg2 lda arg1 ; 3 cyc
.local done and #$fe ; 2 cyc
.local output sta ptr ; 3 cyc
.local ptr lda arg2 ; 3 cyc
and #$3f ; 2 cyc
clc ; 2 cyc
adc #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
output = dest ; top 2 bits are the table bank selector
ptr = dest + 2 ; scratch space assumed ldx arg2 ; 3 cyc
lda bank_switch_table,x ; 4 cyc
; bottom 14 bits except the LSB are the per-bank table index sta PORTB ; 4 cyc
; add $4000 for the bank pointer
lda arg1 ; 3 cyc
and #$fe ; 2 cyc
sta ptr ; 3 cyc
lda arg2 ; 3 cyc
and #$3f ; 2 cyc
clc ; 2 cyc
adc #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; top 2 bits are the table bank selector
ldx arg2 ; 3 cyc
lda bank_switch_table,x ; 4 cyc
sta PORTB ; 4 cyc
; copy the entry into output ; copy the entry into output
ldy #0 ; 2 cyc ldy #0 ; 2 cyc
lda (ptr),y ; 5 cyc lda (ptr),y ; 5 cyc
sta output ; 3 cyc sta output ; 3 cyc
iny ; 2 cyc iny ; 2 cyc
lda (ptr),y ; 5 cyc lda (ptr),y ; 5 cyc
sta output+1 ; 3 cyc sta output+1 ; 3 cyc
; note: we are not restoring memory to save 6 cycles! ; note: we are not restoring memory to save 6 cycles!
; this means those 16kb have to be switched back to base RAM ; this means those 16kb have to be switched back to base RAM
; if we need to use them anywhere else ; if we need to use them anywhere else
;;; restore memory ;;; restore memory
;;lda #$81 ; 2 cyc - disabled ;;lda #$81 ; 2 cyc - disabled
;;sta PORTB ; 4 cyc - disabled ;;sta PORTB ; 4 cyc - disabled
; check that 1 bit we skipped to fit into space ; check that 1 bit we skipped to fit into space
lda arg1 ; 3 cyc lda arg1 ; 3 cyc
and #1 ; 2 cyc and #1 ; 2 cyc
beq done ; 2 cyc beq done ; 2 cyc
; add the second param one last time for the skipped bit ; add the second param one last time for the skipped bit
clc ; 2 cyc clc ; 2 cyc
lda arg2 ; 3 cyc lda arg2 ; 3 cyc
adc output ; 3 cyc adc output ; 3 cyc
sta output ; 3 cyc sta output ; 3 cyc
lda #0 ; 2 cyc lda #0 ; 2 cyc
adc output+1 ; 3 cyc adc output+1 ; 3 cyc
sta output+1 ; 3 cyc sta output+1 ; 3 cyc
done: done:
.endscope
.else
; Using base 48k RAM compatibility mode
; Small table of half squares
; Adapted from https://everything2.com/title/Fast+6502+multiplication
; 81-92 cycles
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; 3 cyc
; (a + x)^2/2
clc ; 2 cyc
adc mul_factor_x ; 3 cyc
tax ; 2 cyc
bcc under256 ; 2 cyc
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
under256:
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
next:
sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
; + x & a & 1:
; (this is a kludge to correct a
; roundoff error that makes odd * odd too low)
ldx mul_factor_x ; 3 cyc
txa ; 2 cyc
and mul_factor_a ; 3 cyc
and #1 ; 2 cyc
clc ; 2 cyc
adc mul_product_lo ; 3 cyc
bcc small_product ; 2 cyc
inc mul_product_hi ; 5 cyc
; - x^2/2
small_product:
sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
.endscope
.endif
.endmacro .endmacro
.proc imul8xe_init .proc imul8xe_init
@ -532,6 +547,14 @@ init:
lda #.hibyte(imul16xe_func) lda #.hibyte(imul16xe_func)
sta imul16_func + 2 sta imul16_func + 2
; ditto for sqr16_func -> sqr16xe_func
lda #$4c ; 'jmp' opcode
sta sqr16_func
lda #.lobyte(sqr16xe_func)
sta sqr16_func + 1
lda #.hibyte(sqr16xe_func)
sta sqr16_func + 2
; create the lookup table ; create the lookup table
; go through the input set, in four 16KB chunks ; go through the input set, in four 16KB chunks
@ -632,7 +655,13 @@ inner_loop:
.endproc .endproc
.proc imul16_func .macro imul16_impl xe
.local arg1
.local arg2
.local result
.local inter
.local arg1_pos
.local arg2_pos
arg1 = FR0 ; 16-bit arg (clobbered) arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result result = FR2 ; 32-bit result
@ -643,20 +672,20 @@ inner_loop:
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2 imul8 result, arg1, arg2, xe
lda #0 lda #0
sta result + 2 sta result + 2
sta result + 3 sta result + 3
imul8 inter, arg1 + 1, arg2 imul8 inter, arg1 + 1, arg2, xe
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
imul8 inter, arg1, arg2 + 1 imul8 inter, arg1, arg2 + 1, xe
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1 imul8 inter, arg1 + 1, arg2 + 1, xe
add16 result + 2, result + 2, inter add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word ; In case of negative inputs, adjust high word
@ -671,47 +700,59 @@ arg1_pos:
arg2_pos: arg2_pos:
rts ; 6 cyc rts ; 6 cyc
.endmacro
; sqr16_impl xe
; Expands to the complete body (including the final rts) of a routine that
; squares the 16-bit value in FR0 and leaves the 32-bit product in FR2.
; xe is forwarded unchanged to imul8: non-zero selects its 64KB lookup-table
; variant, zero selects its base-RAM half-squares variant.
; FR0 may be rewritten in place and FR1 is used as scratch.
.macro sqr16_impl xe
.scope
arg = FR0 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
;inter = temp2
; scratch for partial products; imul8's xe variant also uses dest + 2 and
; dest + 3 as a pointer, so four bytes starting at inter get overwritten
inter = FR1
; the square does not depend on the sign, so a negative input is flipped first
; (neg16 presumably negates arg in place - confirm against its definition)
lda arg + 1
bpl arg_pos
neg16 arg
arg_pos:
; hl * hl
; (h*256 + l) * (h*256 + l)
; h*256*(h*256 + l) + l*(h*256 + l)
; h*h*256*256 + h*l*256 + h*l*256 + l*l
; l*l: table lookup through sqr8, stored in result bytes 0-1
sqr8 result, arg
;imul8 inter, arg, arg, xe
; clear the upper half of the 32-bit result before accumulating into it
lda #0
sta result + 2
sta result + 3
; h*l is multiplied once and then added twice, one byte up (the *256 terms)
imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter
add_carry result + 3 ; presumably folds the add16 carry into the top byte - confirm
add16 result + 1, result + 1, inter
add_carry result + 3
; h*h: table lookup through sqr8, added two bytes up (the *256*256 term)
sqr8 inter, arg + 1
;imul8 inter, arg + 1, arg + 1, xe
add16 result + 2, result + 2, inter
rts ; 6 cyc
.endscope
.endmacro
.proc imul16_func
imul16_impl 0
.endproc .endproc
.proc imul16xe_func .proc imul16xe_func
arg1 = FR0 ; 16-bit arg (clobbered) imul16_impl 1
arg2 = FR1 ; 16-bit arg (clobbered) .endproc
result = FR2 ; 32-bit result
inter = temp2
; h1l1 * h2l2 .proc sqr16_func
; (h1*256 + l1) * (h2*256 + l2) sqr16_impl 0
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) .endproc
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8xe result, arg1, arg2 .proc sqr16xe_func
lda #0 sqr16_impl 1
sta result + 2
sta result + 3
imul8xe inter, arg1 + 1, arg2
add16 result + 1, result + 1, inter
add_carry result + 3
imul8xe inter, arg1, arg2 + 1
add16 result + 1, result + 1, inter
add_carry result + 3
imul8xe inter, arg1 + 1, arg2 + 1
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc
.endproc .endproc
.macro round16 arg .macro round16 arg
@ -825,10 +866,10 @@ keep_going:
quick_exit zy, 2 quick_exit zy, 2
; zx_2 = zx * zx ; zx_2 = zx * zx
imul16_round zx_2, zx, zx, 4 sqr16_round zx_2, zx, 4
; zy_2 = zy * zy ; zy_2 = zy * zy
imul16_round zy_2, zy, zy, 4 sqr16_round zy_2, zy, 4
; zx_zy = zx * zy ; zx_zy = zx * zy
imul16_round zx_zy, zx, zy, 4 imul16_round zx_zy, zx, zy, 4

View file

@ -22,7 +22,10 @@ console.log(
.export mul_lobyte256 .export mul_lobyte256
.export mul_hibyte256 .export mul_hibyte256
.export mul_hibyte512 .export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i + 1) / 2 for the multiplier
.align 256 .align 256
mul_lobyte256: mul_lobyte256:
${db((i) => squares[i] & 0xff)} ${db((i) => squares[i] & 0xff)}
@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)}
mul_hibyte512: mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)} ${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`); `);