Compare commits

...
Sign in to create a new pull request.

8 commits

Author SHA1 Message Date
6479cf530c update some timings 2025-09-16 21:56:50 -07:00
29cd3d968f Shaves 3 seconds off initial view runtime on XE :D
Instead of relying solely on the JMP thunks added to
imul16_func and sqr16_func, three call sites within the
mandelbrot iteration function are patched directly to
jsr to the XE versions, saving like 15 cycles per iter

Ok so it's not a lot, but every second counts. ;)

with XE code disabled:
1539 us/iter
5m13s

with old XE code:
1417 us/iter
4m48s

with new XE code:
1406 us/iter
4m45s
2025-09-06 19:53:25 -07:00
b46e6fb343 fix typo on stub x/y inputs
was accidentally falling through to the
load-a-viewport-from-a-keypress code, which
was not needed here
2025-09-01 12:28:33 -07:00
f2a6af0995 Replace the not-enough-precision 32 bit to float impl
keep the proc though to encapsulate it but uses the older
logic of rounding down to 3.13 first
2025-07-03 18:43:10 -07:00
96e0356e57 WIP input handling for coords
experimental output via 32-bit mult, loses precision in conversion
2025-07-03 18:41:24 -07:00
fab2760394 refactor countdown as a procedure call 2025-06-28 13:43:43 -07:00
fd954da47e Create map file for convenience
export a symbol and it'll appear in mandel.map
2025-06-23 08:17:39 -07:00
4bac47a4fd fix at 256 seconds 2025-06-23 00:31:53 -07:00
2 changed files with 207 additions and 123 deletions

View file

@ -3,7 +3,7 @@
all : mandel.xex
mandel.xex : mandel.o tables.o atari-asm-xex.cfg
ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o
%.o : %.s
ca65 -o $@ $<
@ -15,4 +15,6 @@ clean :
rm -f tables.s
rm -f *.o
rm -f *.xex
rm -f mandel.map

326
mandel.s
View file

@ -126,6 +126,10 @@ KEY_7 = 51
KEY_8 = 53
KEY_9 = 48
KEY_0 = 50
KEY_PERIOD = 34
KEY_E = 42
KEY_X = 22
KEY_Y = 43
.struct float48
exponent .byte
@ -403,6 +407,13 @@ elapsed_work:
elapsed_digit:
.byte 0
input_col:
.byte 0
input_row:
.byte 0
input_max:
.byte 0
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
clc ; 2 cyc
@ -450,7 +461,7 @@ elapsed_digit:
sub 4, dest, arg1, arg2
.endmacro
; 3 + 5 * bytes cycles
; 3 + 5 * (bytes - 1) cycles
.macro shl bytes, arg
asl arg ; 3 cyc
.repeat bytes-1, i
@ -458,22 +469,23 @@ elapsed_digit:
.endrepeat
.endmacro
; 13 cycles
; 8 cycles
.macro shl16 arg
; Convenience wrapper: expands to the generic multi-byte `shl` macro with a
; byte count of 2, operating on the memory operand `arg`.
; NOTE(review): standard 6502 timing tables list ASL/ROL on memory as 5
; cycles (zero page) or 6 (absolute), which would make this 10-12 cycles;
; confirm the cycle count quoted for this macro.
shl 2, arg
.endmacro
; 18 cycles
; 13 cycles
.macro shl24 arg
; Convenience wrapper: expands to the generic multi-byte `shl` macro with a
; byte count of 3, operating on the memory operand `arg`.
; NOTE(review): standard 6502 timing tables list ASL/ROL on memory as 5
; cycles (zero page) or 6 (absolute), which would make this 15-18 cycles;
; confirm the cycle count quoted for this macro.
shl 3, arg
.endmacro
; 23 cycles
; 18 cycles
.macro shl32 arg
; Convenience wrapper: expands to the generic multi-byte `shl` macro with a
; byte count of 4, operating on the memory operand `arg`.
; NOTE(review): standard 6502 timing tables list ASL/ROL on memory as 5
; cycles (zero page) or 6 (absolute), which would make this 20-24 cycles;
; confirm the cycle count quoted for this macro.
shl 4, arg
.endmacro
; 6 * bytes cycles
; 4 * bytes bytes
.macro copy bytes, dest, arg
.repeat bytes, byte ; 6 * bytes cycles
lda arg + byte ; 3 cyc
@ -482,6 +494,7 @@ elapsed_digit:
.endmacro
; 12 cycles
; 8 bytes
.macro copy16 dest, arg
; Convenience wrapper: expands to the generic `copy` macro with a byte count
; of 2, copying from `arg` to `dest` through the accumulator.
; The 8-byte size quoted for this macro is what the call-site patch offsets
; (imul16_patch_offset / sqr16_patch_offset) are built on; it presumably
; only holds when both operands assemble to zero-page addressing - confirm.
copy 2, dest, arg
.endmacro
@ -516,17 +529,19 @@ elapsed_digit:
neg 4, arg
.endmacro
; 11-27 + 23 * shift cycles
; 103-119 cycles for shift=4
; 11-27 + 18 * shift cycles
; 65-81 cycles for shift=3
.macro shift_round_16 arg, shift
.repeat shift
shl32 arg ; 23 cycles
shl32 arg ; 18 cycles
.endrepeat
round16 arg ; 11-27 cycles
.endmacro
; input: arg1, arg2 as fixed4.12
; output: dest as fixed8.24
; patch point jsr at 16 bytes in
imul16_patch_offset = 16
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
@ -536,6 +551,8 @@ elapsed_digit:
; input: arg as fixed4.12
; output: dest as fixed8.24
; patch point jsr at 8 bytes in
sqr16_patch_offset = 8
.macro sqr16 dest, arg
copy16 FR0, arg ; 12 cyc
jsr sqr16_func ; ? cyc
@ -681,71 +698,6 @@ bank_switch_table:
.endif
.endmacro
; Probe for bank-switched extended RAM and, if it is present, redirect the
; 16-bit multiply/square routines to their XE variants and build the
; multiplication lookup table across four banks.
; Returns early, without patching anything, when the probe fails.
; Modifies code at imul16_func and sqr16_func, and writes EXTENDED_RAM,
; arg1 (FR1), arg2 (FR2) and ptr.
.proc imul8xe_init
; Probe: write 0 with bank 0 selected and 1 with bank 1 selected to the same
; address, then read it back with bank 0 selected. Reading 0 means the two
; writes landed in different memory, i.e. bank switching works.
bank_switch 0
lda #0
sta EXTENDED_RAM
bank_switch 1
lda #1
sta EXTENDED_RAM
bank_switch 0
lda EXTENDED_RAM
beq init
; no bank switching available, we just overwrite the value in base ram
rts
init:
; patch imul16_func into a forwarding thunk to imul16xe_func
; (the first three bytes of imul16_func become `jmp imul16xe_func`)
lda #$4c ; 'jmp' opcode
sta imul16_func
lda #.lobyte(imul16xe_func)
sta imul16_func + 1
lda #.hibyte(imul16xe_func)
sta imul16_func + 2
; ditto for sqr16_func -> sqr16xe_func
lda #$4c ; 'jmp' opcode
sta sqr16_func
lda #.lobyte(sqr16xe_func)
sta sqr16_func + 1
lda #.hibyte(sqr16xe_func)
sta sqr16_func + 2
; create the lookup table
; go through the input set, in four 16KB chunks
; NOTE(review): `result` is defined here but not referenced in this
; procedure's body.
arg1 = FR1
arg2 = FR2
result = FR0
; start with both multiplier bytes at zero and ptr = $4000
lda #$00
sta arg1
sta arg2
sta ptr
lda #$40
sta ptr + 1
; Each call below fills one bank; arg1, arg2 and ptr are not reloaded
; between calls, so imul8xe_init_section presumably leaves them ready for
; the next section - confirm against that routine.
; $00 * $00 -> $3f * $ff
bank_switch 0
jsr imul8xe_init_section
; $40 * $00 -> $7f * $ff
bank_switch 1
jsr imul8xe_init_section
; $80 * $00 -> $bf * $ff
bank_switch 2
jsr imul8xe_init_section
; $c0 * $00 -> $ff * $ff
bank_switch 3
jsr imul8xe_init_section
rts
.endproc
; Initialize a 16 KB chunk of the table
; input: multipliers in temp
@ -983,6 +935,17 @@ common:
.endproc
; Convert a 32-bit fixed-point value to an Atari float, going through the
; 16-bit 3.13 format (so precision beyond 13 fractional bits is dropped).
; rounds to 16-bit first!
; input in FR0, 32 bits signed 6.26 fixed
; output in FR0, Atari float
; clobbers a, x, y, FR0, FR1
.proc fixed6_26_to_float
; shift left 3 bits (6.26 -> 3.29) and round to 16 bits
shift_round_16 FR0, 3
; move the two bytes at FR0 + 2 (the upper half of the 32-bit value) down
; to FR0, where fixed3_13_to_float takes its input
copy16 FR0, FR0 + 2
jsr fixed3_13_to_float
rts
.endproc
; input in FR0, Atari float
; output in FR0, 16 bits signed 3.13 fixed
; clobbers a, x, y, FR0, FR1
@ -1134,12 +1097,15 @@ keep_going:
shift_round_16 zy, 3
; zx_2 = zx * zx
fixup_sqr16_1:
sqr16 zx_2, zx + 2
; zy_2 = zy * zy
fixup_sqr16_2:
sqr16 zy_2, zy + 2
; zx_zy = zx * zy
fixup_imul16_1:
imul16 zx_zy, zx + 2, zy + 2
; dist = zx_2 + zy_2
@ -1603,7 +1569,7 @@ number_keys:
beq five
cpy #KEY_6
beq six
jmp skip_char
jmp letter_keys
one:
ldx #0
@ -1622,7 +1588,21 @@ five:
jmp load_key_viewport
six:
ldx #5
; fall through
jmp load_key_viewport
letter_keys:
cpy #KEY_X
bne not_x
jsr input_x
jmp done
not_x:
cpy #KEY_Y
bne not_y
jsr input_y
jmp done
not_y:
jmp skip_char
load_key_viewport:
jsr load_viewport
; fall through
@ -1632,6 +1612,23 @@ done:
.endproc
; Handler invoked from the key dispatch when KEY_X is pressed.
; Calls input_number with X = col_x and Y = 1.
; NOTE(review): input_number is currently an empty stub, so this has no
; effect yet; the meaning of the X/Y arguments (presumably a screen column
; and a row or field index) is not established by this code - confirm once
; input_number is implemented.
.proc input_x
ldx #col_x
ldy #1
jsr input_number
rts
.endproc
; Handler invoked from the key dispatch when KEY_Y is pressed.
; Placeholder: returns immediately without doing anything.
.proc input_y
rts
.endproc
; Number-entry routine called by input_x (with X = col_x, Y = 1).
; Placeholder: returns immediately; the X and Y arguments are ignored.
.proc input_number
rts
.endproc
.proc clear_screen
; zero the range from framebuffer_top to display_list
lda #.lobyte(framebuffer_top)
@ -1679,9 +1676,7 @@ zero_byte_loop:
draw_string_const str_x
copy32 FR0, ox
shift_round_16 FR0, 3
copy16 FR0, FR0 + 2
jsr fixed3_13_to_float
jsr fixed6_26_to_float
jsr FASC
jsr draw_string
@ -1690,9 +1685,7 @@ zero_byte_loop:
draw_string_const str_y
copy32 FR0, oy
shift_round_16 FR0, 3
copy16 FR0, FR0 + 2
jsr fixed3_13_to_float
jsr fixed6_26_to_float
jsr FASC
jsr draw_string
@ -1991,51 +1984,25 @@ update_status:
lda FR0 + 1
sta elapsed_work + 1
;jsr IFP
;jsr FASC
;jsr draw_string
.macro countdown divisor, digits
.scope
    ; Emit one digit of the elapsed time: count in X how many times the
    ; constant `divisor` can be subtracted from the unsigned 16-bit value
    ; in elapsed_work (the remainder is left there), then draw the X-th
    ; character of the `digits` table.
    ; Clobbers a, x, INBUFF, elapsed_digit; calls draw_string.
    ldx #0
countdown_loop:
    ; Unsigned 16-bit compare, high byte first. The low bytes may only
    ; decide the result when the high bytes are equal: testing them
    ; unconditionally stops the count early for values such as
    ; 256 ($0100) against a divisor of 60 ($003c).
    lda elapsed_work + 1
    cmp #.hibyte(divisor)
    bcc countdown_done          ; high byte smaller: elapsed_work < divisor
    bne countdown_subtract      ; high byte larger: elapsed_work > divisor
    lda elapsed_work
    cmp #.lobyte(divisor)
    bcc countdown_done
countdown_subtract:
    ; elapsed_work -= divisor
    sec
    lda elapsed_work
    sbc #.lobyte(divisor)
    sta elapsed_work
    lda elapsed_work + 1
    sbc #.hibyte(divisor)
    sta elapsed_work + 1
    inx
    jmp countdown_loop
countdown_done:
    ; look up the character for this count, toggle bit 7, and draw it as
    ; a one-character string from elapsed_digit
    lda digits,x
    eor #$80
    sta elapsed_digit
    lda #.lobyte(elapsed_digit)
    sta INBUFF
    lda #.hibyte(elapsed_digit)
    sta INBUFF + 1
    jsr draw_string
.endscope
.endmacro
draw_string_const str_space
countdown 36000, digits_space
countdown 3600, digits_zero
.macro do_countdown divisor, digits
    ; Call the countdown procedure for one digit position.
    ; INBUFF <- address of the digit lookup string
    lda #.lobyte(digits)
    sta INBUFF
    lda #.hibyte(digits)
    sta INBUFF + 1
    ; 16-bit divisor: low byte in X, high byte in Y
    ldx #.lobyte(divisor)
    ldy #.hibyte(divisor)
    jsr countdown
.endmacro
do_countdown 36000, digits_space
do_countdown 3600, digits_zero
draw_string_const str_h
countdown 600, digits_zero
countdown 60, digits_zero
do_countdown 600, digits_zero
do_countdown 60, digits_zero
draw_string_const str_m
countdown 10, digits_zero
countdown 1, digits_zero
do_countdown 10, digits_zero
do_countdown 1, digits_zero
draw_string_const str_s
skipped:
@ -2097,3 +2064,118 @@ loop:
jmp main_loop
.endproc
; Draw one digit of the elapsed time.
; Repeatedly subtracts the divisor from the unsigned 16-bit elapsed_work
; (leaving the remainder there) and draws the character found at that
; count's index in the digit string.
; input: INBUFF = address of the digit string
;        X = divisor low byte, Y = divisor high byte
; clobbers a, y, temp, INBUFF, elapsed_digit; calls draw_string
.proc countdown
    divisor = temp

    ; stash the divisor so it can be used as a memory operand
    stx divisor
    sty divisor + 1

    ; Y counts the whole divisors removed so far
    ldy #0

compare:
    ; unsigned 16-bit compare of elapsed_work against divisor,
    ; high byte first
    lda elapsed_work + 1
    cmp divisor + 1
    bcc emit_digit          ; high byte smaller: elapsed_work < divisor
    bne subtract            ; high byte larger: elapsed_work > divisor
    ; high bytes equal, so the low bytes decide
    lda elapsed_work
    cmp divisor
    bcc emit_digit

subtract:
    ; elapsed_work -= divisor
    sec
    lda elapsed_work
    sbc divisor
    sta elapsed_work
    lda elapsed_work + 1
    sbc divisor + 1
    sta elapsed_work + 1
    iny
    jmp compare

emit_digit:
    ; fetch the Y-th character of the caller's digit string, toggle bit 7,
    ; and park it in the one-byte buffer
    lda (INBUFF),y
    eor #$80
    sta elapsed_digit
    ; repoint INBUFF at that buffer and draw it
    lda #.lobyte(elapsed_digit)
    sta INBUFF
    lda #.hibyte(elapsed_digit)
    sta INBUFF + 1
    jsr draw_string
    rts
.endproc
; Probe for bank-switched extended RAM and, if it is present, redirect the
; 16-bit multiply/square routines to their XE variants and build the
; multiplication lookup table across four banks.
; Returns early, without patching anything, when the probe fails.
; Modifies code at imul16_func, sqr16_func and three call sites inside
; mandelbrot, and writes EXTENDED_RAM, arg1 (FR1), arg2 (FR2) and ptr.
.proc imul8xe_init
; Probe: write 0 with bank 0 selected and 1 with bank 1 selected to the same
; address, then read it back with bank 0 selected. Reading 0 means the two
; writes landed in different memory, i.e. bank switching works.
bank_switch 0
lda #0
sta EXTENDED_RAM
bank_switch 1
lda #1
sta EXTENDED_RAM
bank_switch 0
lda EXTENDED_RAM
beq init
; no bank switching available, we just overwrite the value in base ram
rts
init:
; patch imul16_func into a forwarding thunk to imul16xe_func
; (the first three bytes of imul16_func become `jmp imul16xe_func`)
lda #$4c ; 'jmp' opcode
sta imul16_func
lda #.lobyte(imul16xe_func)
sta imul16_func + 1
; Also rewrite the operand of the jsr inside the imul16 expansion at
; mandelbrot::fixup_imul16_1 so that call goes straight to the XE routine
; instead of through the thunk. Only the two address bytes are written;
; the byte at the patch offset is assumed to already be a jsr opcode, which
; depends on imul16_patch_offset matching the macro's assembled layout.
sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1
lda #.hibyte(imul16xe_func)
sta imul16_func + 2
sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2
; ditto for sqr16_func -> sqr16xe_func
lda #$4c ; 'jmp' opcode
sta sqr16_func
lda #.lobyte(sqr16xe_func)
sta sqr16_func + 1
; same direct call-site patch for the two sqr16 expansions in mandelbrot
sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1
sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1
lda #.hibyte(sqr16xe_func)
sta sqr16_func + 2
sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2
sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2
; create the lookup table
; go through the input set, in four 16KB chunks
; NOTE(review): `result` is defined here but not referenced in this
; procedure's body.
arg1 = FR1
arg2 = FR2
result = FR0
; start with both multiplier bytes at zero and ptr = $4000
lda #$00
sta arg1
sta arg2
sta ptr
lda #$40
sta ptr + 1
; Each call below fills one bank; arg1, arg2 and ptr are not reloaded
; between calls, so imul8xe_init_section presumably leaves them ready for
; the next section - confirm against that routine.
; $00 * $00 -> $3f * $ff
bank_switch 0
jsr imul8xe_init_section
; $40 * $00 -> $7f * $ff
bank_switch 1
jsr imul8xe_init_section
; $80 * $00 -> $bf * $ff
bank_switch 2
jsr imul8xe_init_section
; $c0 * $00 -> $ff * $ff
bank_switch 3
jsr imul8xe_init_section
rts
.endproc