update some timings

Shaves 3 seconds off initial view runtime on XE :D
Instead of relying solely on the JMP thunks added to imul16_func and sqr16_func, three call sites within the mandelbrot iteration function are patched directly to jsr to the XE versions, saving about 15 cycles per iteration. OK, so it's not a lot, but every second counts. ;) With XE code disabled: 1539 us/iter, 5m13s. With old XE code: 1417 us/iter, 4m48s. With new XE code: 1406 us/iter, 4m45s.
2025-09-16 21:56:50 -07:00 · 2025-09-06 19:53:25 -07:00 · 2025-09-01 12:28:33 -07:00 · 2025-07-03 18:43:10 -07:00 · 2025-07-03 18:41:24 -07:00 · 2025-06-28 13:43:43 -07:00
2 changed files with 207 additions and 123 deletions
--- a/4
+++ b/4
@ -3,7 +3,7 @@
 all : mandel.xex

 mandel.xex : mandel.o tables.o atari-asm-xex.cfg
-	ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o

 %.o : %.s
 	ca65 -o $@ $<
@ -15,4 +15,6 @@ clean :
 	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
+	rm -f mandel.map
+

--- a/mandel.s
+++ b/mandel.s
@ -126,6 +126,10 @@ KEY_7     = 51
 KEY_8     = 53
 KEY_9     = 48
 KEY_0     = 50
+KEY_PERIOD = 34
+KEY_E     = 42
+KEY_X     = 22
+KEY_Y     = 43

 .struct float48
    exponent .byte
@ -403,6 +407,13 @@ elapsed_work:
 elapsed_digit:
    .byte 0

+input_col:
+    .byte 0
+input_row:
+    .byte 0
+input_max:
+    .byte 0
+
 ; 2 + 9 * byte cycles
 .macro add bytes, dest, arg1, arg2
    clc ; 2 cyc
@ -450,7 +461,7 @@ elapsed_digit:
    sub 4, dest, arg1, arg2
 .endmacro

-; 3 + 5 * bytes cycles
+; 3 + 5 * (bytes - 1) cycles
 .macro shl bytes, arg
    asl arg              ; 3 cyc
    .repeat bytes-1, i
@ -458,22 +469,23 @@ elapsed_digit:
    .endrepeat
 .endmacro

-; 13 cycles
+; 8 cycles
 .macro shl16 arg
    shl 2, arg
 .endmacro

-; 18 cycles
+; 13 cycles
 .macro shl24 arg
    shl 3, arg
 .endmacro

-; 23 cycles
+; 18 cycles
 .macro shl32 arg
    shl 4, arg
 .endmacro

 ; 6 * bytes cycles
+; 4 * bytes bytes
 .macro copy bytes, dest, arg
    .repeat bytes, byte ; 6 * bytes cycles
        lda arg + byte  ; 3 cyc
@ -482,6 +494,7 @@ elapsed_digit:
 .endmacro

 ; 12 cycles
+; 8 bytes
 .macro copy16 dest, arg
    copy 2, dest, arg
 .endmacro
@ -516,17 +529,19 @@ elapsed_digit:
    neg 4, arg
 .endmacro

-; 11-27 + 23 * shift cycles
-; 103-119 cycles for shift=4
+; 11-27 + 18 * shift cycles
+; 65-81 cycles for shift=3
 .macro shift_round_16 arg, shift
    .repeat shift
-        shl32 arg ; 23 cycles
+        shl32 arg ; 18 cycles
    .endrepeat
    round16 arg ; 11-27 cycles
 .endmacro

 ; input: arg1, arg2 as fixed4.12
 ; output: dest as fixed8.24
+; patch point jsr at 16 bytes in
+imul16_patch_offset = 16
 .macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
@ -536,6 +551,8 @@ elapsed_digit:

 ; input: arg as fixed4.12
 ; output: dest as fixed8.24
+; patch point jsr at 8 bytes in
+sqr16_patch_offset = 8
 .macro sqr16 dest, arg
    copy16 FR0, arg   ; 12 cyc
    jsr sqr16_func    ; ? cyc
@ -681,71 +698,6 @@ bank_switch_table:
    .endif
 .endmacro

-.proc imul8xe_init
-
-    bank_switch 0
-    lda #0
-    sta EXTENDED_RAM
-    bank_switch 1
-    lda #1
-    sta EXTENDED_RAM
-    bank_switch 0
-    lda EXTENDED_RAM
-    beq init
-
-    ; no bank switching available, we just overwrite the value in base ram
-    rts
-
-init:
-
-    ; patch imul16_func into a forwarding thunk to imul16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta imul16_func
-    lda #.lobyte(imul16xe_func)
-    sta imul16_func + 1
-    lda #.hibyte(imul16xe_func)
-    sta imul16_func + 2
-
-    ; ditto for sqr16_func -> sqr16xe_func
-    lda #$4c ; 'jmp' opcode
-    sta sqr16_func
-    lda #.lobyte(sqr16xe_func)
-    sta sqr16_func + 1
-    lda #.hibyte(sqr16xe_func)
-    sta sqr16_func + 2
-
-    ; create the lookup table
-    ; go through the input set, in four 16KB chunks
-
-    arg1 = FR1
-    arg2 = FR2
-    result = FR0
-
-    lda #$00
-    sta arg1
-    sta arg2
-    sta ptr
-    lda #$40
-    sta ptr + 1
-
-    ; $00 * $00 -> $3f * $ff
-    bank_switch 0
-    jsr imul8xe_init_section
-
-    ; $40 * $00 -> $7f * $ff
-    bank_switch 1
-    jsr imul8xe_init_section
-
-    ; $80 * $00 -> $bf * $ff
-    bank_switch 2
-    jsr imul8xe_init_section
-
-    ; $c0 * $00 -> $ff * $ff
-    bank_switch 3
-    jsr imul8xe_init_section
-
-    rts
-.endproc

 ; Initialize a 16 KB chunk of the table
 ; input: multipliers in temp
@ -983,6 +935,17 @@ common:

 .endproc

+; Convert a 32-bit fixed-point value in FR0 to an Atari float.
+; Rounds to 16-bit first: the value is shifted left 3 bits and rounded
+; by shift_round_16, then bytes 2-3 of FR0 are copied down to bytes 0-1
+; and handed to fixed3_13_to_float for the actual conversion.
+; input in FR0, 32 bits signed 6.26 fixed
+; output in FR0, Atari float
+; clobbers a, x, y, FR0, FR1
+.proc fixed6_26_to_float
+    shift_round_16 FR0, 3       ; three shl32 passes, then round16
+    copy16 FR0, FR0 + 2         ; move bytes 2-3 of FR0 down to bytes 0-1
+    jsr fixed3_13_to_float
+    rts
+.endproc
+
 ; input in FR0, Atari float
 ; output in FR0, 16 bits signed 3.13 fixed
 ; clobbers a, x, y, FR0, FR1
@ -1134,12 +1097,15 @@ keep_going:
    shift_round_16 zy, 3

    ; zx_2 = zx * zx
+fixup_sqr16_1:
    sqr16 zx_2, zx + 2

    ; zy_2 = zy * zy
+fixup_sqr16_2:
    sqr16 zy_2, zy + 2

    ; zx_zy = zx * zy
+fixup_imul16_1:
    imul16 zx_zy, zx + 2, zy + 2

    ; dist = zx_2 + zy_2
@ -1603,7 +1569,7 @@ number_keys:
    beq five
    cpy #KEY_6
    beq six
-    jmp skip_char
+    jmp letter_keys

 one:
    ldx #0
@ -1622,7 +1588,21 @@ five:
    jmp load_key_viewport
 six:
    ldx #5
-    ; fall through
+    jmp load_key_viewport
+
+letter_keys:
+    cpy #KEY_X
+    bne not_x
+    jsr input_x
+    jmp done
+not_x:
+    cpy #KEY_Y
+    bne not_y
+    jsr input_y
+    jmp done
+not_y:
+    jmp skip_char
+
 load_key_viewport:
    jsr load_viewport
    ; fall through
@ -1632,6 +1612,23 @@ done:

 .endproc

+; Handler for the X key (called from the key dispatch on KEY_X).
+; Work in progress: loads X with col_x and Y with 1, then calls
+; input_number. NOTE(review): presumably X/Y are the screen column/row
+; for the input field -- confirm once input_number is implemented.
+.proc input_x
+    ldx #col_x
+    ldy #1
+    jsr input_number
+
+
+    rts
+.endproc
+
+; Handler for the Y key (called from the key dispatch on KEY_Y).
+; Currently an empty stub: returns immediately.
+.proc input_y
+    rts
+.endproc
+
+; Called by input_x with values loaded into X and Y.
+; Currently an empty stub: returns immediately without touching them.
+.proc input_number
+    rts
+.endproc
+
 .proc clear_screen
    ; zero the range from framebuffer_top to display_list
    lda #.lobyte(framebuffer_top)
@ -1679,9 +1676,7 @@ zero_byte_loop:
    draw_string_const str_x

    copy32 FR0, ox
-    shift_round_16 FR0, 3
-    copy16 FR0, FR0 + 2
-    jsr fixed3_13_to_float
+    jsr fixed6_26_to_float
    jsr FASC
    jsr draw_string

@ -1690,9 +1685,7 @@ zero_byte_loop:
    draw_string_const str_y

    copy32 FR0, oy
-    shift_round_16 FR0, 3
-    copy16 FR0, FR0 + 2
-    jsr fixed3_13_to_float
+    jsr fixed6_26_to_float
    jsr FASC
    jsr draw_string

@ -1991,51 +1984,25 @@ update_status:
    lda FR0 + 1
    sta elapsed_work + 1

-    ;jsr IFP
-    ;jsr FASC
-    ;jsr draw_string
-
-    .macro countdown divisor, digits
-        .scope
-            ; count the hours
-            ldx #0
-        countdown_loop:
-            lda elapsed_work + 1
-            cmp #.hibyte(divisor)
-            bcc countdown_done
-            lda elapsed_work
-            cmp #.lobyte(divisor)
-            bcc countdown_done
-            sec
-            lda elapsed_work
-            sbc #.lobyte(divisor)
-            sta elapsed_work
-            lda elapsed_work + 1
-            sbc #.hibyte(divisor)
-            sta elapsed_work + 1
-            inx
-            jmp countdown_loop
-        countdown_done:
-            lda digits,x
-            eor #$80
-            sta elapsed_digit
-            lda #.lobyte(elapsed_digit)
-            sta INBUFF
-            lda #.hibyte(elapsed_digit)
-            sta INBUFF + 1
-            jsr draw_string
-        .endscope
-    .endmacro
-
    draw_string_const str_space
-    countdown 36000, digits_space
-    countdown 3600, digits_zero
+    
+    ; Set up the arguments for the countdown procedure and call it:
+    ;   X/Y    = low/high byte of the 16-bit constant divisor
+    ;   INBUFF = address of the digit string to index into
+    ; clobbers a, x, y, INBUFF (plus whatever countdown clobbers)
+    .macro do_countdown divisor, digits
+        ldx #.lobyte(divisor)
+        ldy #.hibyte(divisor)
+        lda #.lobyte(digits)
+        sta INBUFF
+        lda #.hibyte(digits)
+        sta INBUFF + 1
+        jsr countdown
+    .endmacro
+    do_countdown 36000, digits_space
+    do_countdown 3600, digits_zero
    draw_string_const str_h
-    countdown 600, digits_zero
-    countdown 60, digits_zero
+    do_countdown 600, digits_zero
+    do_countdown 60, digits_zero
    draw_string_const str_m
-    countdown 10, digits_zero
-    countdown 1, digits_zero
+    do_countdown 10, digits_zero
+    do_countdown 1, digits_zero
    draw_string_const str_s

 skipped:
@ -2097,3 +2064,118 @@ loop:
    jmp main_loop

 .endproc
+
+; Draw one digit of the elapsed time.
+; Repeatedly subtracts the 16-bit divisor from elapsed_work while
+; elapsed_work >= divisor, counting the subtractions in Y, then uses
+; that count as an index into the digit string and draws the character.
+; On return elapsed_work holds the remainder, so successive calls with
+; decreasing divisors produce successive digits.
+; digit string in INBUFF
+; divisor X/Y
+; clobbers temp, calls draw_string
+; also overwrites a, y, elapsed_work, elapsed_digit and INBUFF
+; (INBUFF is left pointing at elapsed_digit)
+.proc countdown
+    divisor = temp
+    stx divisor
+    sty divisor + 1
+
+    ; count how many times divisor fits into elapsed_work
+    ldy #0
+countdown_loop:
+    ; 16-bit unsigned compare: high bytes decide unless they are equal
+    lda elapsed_work + 1
+    cmp divisor + 1
+    beq countdown_lobyte
+    bcc countdown_done          ; high byte smaller: elapsed_work < divisor
+    bcs countdown_inc           ; high byte larger: always taken here
+countdown_lobyte:
+    ; high bytes equal, so the low bytes decide
+    lda elapsed_work
+    cmp divisor
+    bcc countdown_done
+countdown_inc:
+    ; elapsed_work -= divisor
+    sec
+    lda elapsed_work
+    sbc divisor
+    sta elapsed_work
+    lda elapsed_work + 1
+    sbc divisor + 1
+    sta elapsed_work + 1
+    iny
+    jmp countdown_loop
+countdown_done:
+    ; fetch the character for this count and toggle its high bit
+    lda (INBUFF),y
+    eor #$80
+    sta elapsed_digit
+    ; point INBUFF at the one-character buffer for draw_string
+    lda #.lobyte(elapsed_digit)
+    sta INBUFF
+    lda #.hibyte(elapsed_digit)
+    sta INBUFF + 1
+    jsr draw_string
+    rts
+.endproc
+
+; Detect banked extended RAM and, if present, enable the XE multiply path.
+; Steps:
+;   1. Probe: store 0 at EXTENDED_RAM with bank 0 selected, store 1 with
+;      bank 1 selected, then read back with bank 0 selected. A zero
+;      readback means the two stores landed in different banks.
+;   2. Overwrite the first three bytes of imul16_func and sqr16_func with
+;      a jmp to imul16xe_func / sqr16xe_func.
+;   3. Patch the jsr operands at three call sites inside mandelbrot
+;      (fixup_imul16_1, fixup_sqr16_1, fixup_sqr16_2) so they call the XE
+;      routines directly rather than going through the thunks.
+;   4. Build the lookup table by calling imul8xe_init_section once for
+;      each of banks 0-3.
+; Returns without patching anything if the probe fails.
+.proc imul8xe_init
+
+    bank_switch 0
+    lda #0
+    sta EXTENDED_RAM
+    bank_switch 1
+    lda #1
+    sta EXTENDED_RAM
+    bank_switch 0
+    lda EXTENDED_RAM
+    beq init
+
+    ; no bank switching available: the second store just overwrote the
+    ; first one in base ram, so leave the non-XE code in place
+    rts
+
+init:
+
+    ; patch imul16_func into a forwarding thunk to imul16xe_func
+    ; and store the same address into the jsr operand at the call site
+    lda #$4c ; 'jmp' opcode
+    sta imul16_func
+    lda #.lobyte(imul16xe_func)
+    sta imul16_func + 1
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1
+    lda #.hibyte(imul16xe_func)
+    sta imul16_func + 2
+    sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2
+
+    ; ditto for sqr16_func -> sqr16xe_func (two call sites)
+    lda #$4c ; 'jmp' opcode
+    sta sqr16_func
+    lda #.lobyte(sqr16xe_func)
+    sta sqr16_func + 1
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1
+    lda #.hibyte(sqr16xe_func)
+    sta sqr16_func + 2
+    sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2
+    sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2
+
+
+    ; create the lookup table
+    ; go through the input set, in four 16KB chunks
+
+    arg1 = FR1
+    arg2 = FR2
+    result = FR0
+
+    ; start with both multipliers at zero and ptr = $4000
+    lda #$00
+    sta arg1
+    sta arg2
+    sta ptr
+    lda #$40
+    sta ptr + 1
+
+    ; $00 * $00 -> $3f * $ff
+    bank_switch 0
+    jsr imul8xe_init_section
+
+    ; $40 * $00 -> $7f * $ff
+    bank_switch 1
+    jsr imul8xe_init_section
+
+    ; $80 * $00 -> $bf * $ff
+    bank_switch 2
+    jsr imul8xe_init_section
+
+    ; $c0 * $00 -> $ff * $ff
+    bank_switch 3
+    jsr imul8xe_init_section
+
+    rts
+.endproc
Author	SHA1	Message	Date
Brooke Vibber	6479cf530c	update some timings	2025-09-16 21:56:50 -07:00
Brooke Vibber	29cd3d968f	Shaves 3 seconds off initial view runtime on XE :D Instead of relying solely on the JMP thunks added to imul16_func and sqr16_func, three call sites within the mandelbrot iteration function are patched directly to jsr to the XE versions, saving about 15 cycles per iteration. OK, so it's not a lot, but every second counts. ;) With XE code disabled: 1539 us/iter, 5m13s. With old XE code: 1417 us/iter, 4m48s. With new XE code: 1406 us/iter, 4m45s.	2025-09-06 19:53:25 -07:00
Brooke Vibber	b46e6fb343	Fix typo on stub x/y inputs: it was accidentally falling through to the load-a-viewport-from-a-keypress code, which was not needed here.	2025-09-01 12:28:33 -07:00
Brooke Vibber	f2a6af0995	Replace the not-enough-precision 32-bit to float implementation. Keeps the proc to encapsulate it, but uses the older logic of rounding down to 3.13 first.	2025-07-03 18:43:10 -07:00
Brooke Vibber	96e0356e57	WIP input handling for coords; experimental output via 32-bit mult, loses precision in conversion	2025-07-03 18:41:24 -07:00
Brooke Vibber	fab2760394	refactor countdown as a procedure call	2025-06-28 13:43:43 -07:00
Brooke Vibber	fd954da47e	Create map file for convenience export a symbol and it'll appear in mandel.map	2025-06-23 08:17:39 -07:00
Brooke Vibber	4bac47a4fd	fix at 256 seconds	2025-06-23 00:31:53 -07:00