diff --git a/Makefile b/Makefile index 711adcd..bd14c7d 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ all : mandel.xex mandel.xex : mandel.o tables.o atari-asm-xex.cfg - ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o %.o : %.s ca65 -o $@ $< @@ -15,6 +15,4 @@ clean : rm -f tables.s rm -f *.o rm -f *.xex - rm -f mandel.map - diff --git a/mandel.s b/mandel.s index b52f24a..b8985b3 100644 --- a/mandel.s +++ b/mandel.s @@ -1,16 +1,16 @@ ; Our zero-page vars -ox = $80 ; fixed6.26: center point x -oy = $84 ; fixed6.26: center point y -cx = $88 ; fixed6.26: c_x -cy = $8c ; fixed6.26: c_y +ox = $80 ; fixed8.24: center point x +oy = $84 ; fixed8.24: center point y +cx = $88 ; fixed8.24: c_x +cy = $8c ; fixed8.24: c_y -zx = $90 ; fixed6.26: z_x -zy = $94 ; fixed6.26: z_y -zx_2 = $98 ; fixed6.26: z_x^2 -zy_2 = $9c ; fixed6.26: z_y^2 +zx = $90 ; fixed8.24: z_x +zy = $94 ; fixed8.24: z_y +zx_2 = $98 ; fixed8.24: z_x^2 +zy_2 = $9c ; fixed8.24: z_y^2 -zx_zy = $a0 ; fixed6.26: z_x * z_y -dist = $a4 ; fixed6.26: z_x^2 + z_y^2 +zx_zy = $a0 ; fixed8.24: z_x * z_y +dist = $a4 ; fixed8.24: z_x^2 + z_y^2 sx = $a8 ; i16: screen pixel x sy = $aa ; i16: screen pixel y z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not @@ -31,12 +31,10 @@ chroma_offset = $bb ; u8 palette_ticks = $bc ; u8 chroma_ticks = $bd ; u8 count_frames = $be ; u8 -; free space $bf +count_pixels = $bf ; u8 -count_iters = $c0 ; u16 -text_col = $c2 ; u8 -text_row = $c3 ; u8 -; free space c4-cb +total_pixels = $c0 ; float48 +total_ms = $c6 ; float48 temp = $cc ; u16 temp2 = $ce ; u16 @@ -61,12 +59,10 @@ LBUFF = $0580 ; result buffer for FASC routine ; FP ROM routine vectors FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) -FPI = $D9D2 ; floating point to integer FADD = $DA66 ; ADDITION (FR0 += FR1) FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) FDIV = $DB28 ; DIVISION (FR0 /= FR1) -ZFR0 = $DA44 ; clear FR0 ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) @@ -80,7 +76,7 @@ framebuffer_bottom = $b000 display_list = $bf00 framebuffer_end = $c000 -height = 176 +height = 184 half_height = height >> 1 width = 160 half_width = width >> 1 @@ -126,10 +122,6 @@ KEY_7 = 51 KEY_8 = 53 KEY_9 = 48 KEY_0 = 50 -KEY_PERIOD = 34 -KEY_E = 42 -KEY_X = 22 -KEY_Y = 43 .struct float48 exponent .byte @@ -148,68 +140,25 @@ strings: str_self: .byte "MANDEL-6502" str_self_end: - .byte 0 str_speed: - .byte "us/iter: " + .byte " ms/px" str_speed_end: - .byte 0 str_run: .byte " RUN" str_run_end: - .byte 0 str_done: .byte "DONE" str_done_end: - .byte 0 -str_padding: - .byte " " -str_padding_end: - .byte 0 - -str_space: - .byte " " - .byte 0 - -str_h: - .byte "h" - .byte 0 -str_m: - .byte "m" - .byte 0 -str_s: - .byte "s" - .byte 0 +str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done -str_padding_len = str_padding_end - str_padding +speed_precision = 6 -; "3h59m59s" -str_elapsed_spacer = 8 -speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1 +speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 +speed_len = 14 + str_speed_len -col_x = 1 -str_x: - .byte "X:" - .byte 0 -str_x_len = 2 -str_x_space = 12 -str_x_padding = 2 - -col_y = col_x + str_x_len + str_x_space + str_x_padding -str_y: - .byte "Y:" - .byte 0 -str_y_len = 2 -str_y_space = 12 -str_y_padding = 2 - -col_zoom = col_y + str_y_len + str_y_space + str_y_padding -str_zoom: - .byte "ZOOM:" - .byte 0 -str_zoom_len = 5 char_map: ; Map ATASCII string values to framebuffer font entries @@ -225,12 +174,8 @@ char_map: .endrepeat hex_chars: -digits_zero: .byte "0123456789abcdef" -digits_space: - .byte " 123456789abcdef" - aspect: ; aspect ratio! ; pixels at 320w are 5:6 (narrow) @@ -244,49 +189,20 @@ aspect: ; ; 184h is the equiv of 220.8h at square pixels ; 320 / 220.8 = 1.45 display aspect ratio -aspect_x: ; fixed3.13 5/4 - .word 5 << (13 - 2) +aspect_x: ; fixed4.16 5/4 + .word 5 << (12 - 2) -aspect_y: ; fixed3.13 3/4 - .word 3 << (13 - 2) +aspect_y: ; fixed4.16 3/4 + .word 3 << (12 - 2) -fixed3_13_as_float: ; float48 - ; 1 << 13 - ; 8192 - ; 81 92 . 00 00 00 - .byte 65 ; exponent/sign - +1 byte - .byte $81 - .byte $92 - .byte $00 - .byte $00 - .byte $00 - -sec_per_frame: ; float48 00 . 01 66 66 66 67 - .byte 63 ; exponent/sign - -1 bytes - .byte $01 ; BCD digits +ms_per_frame: ; float48 16.66666667 + .byte 64 ; exponent/sign + .byte $16 ; BCD digits .byte $66 .byte $66 .byte $66 .byte $67 -us_per_sec: ; float48 1e9 01 00 0,0 00 . 00 - .byte 67 ; exponent/sign +3 bytes - .byte $01 ; BCD digits - .byte $00 - .byte $00 - .byte $00 - .byte $00 - -total_iters: ; float48 - .repeat 6 - .byte 0 - .endrepeat - -total_sec: ; float48 - .repeat 6 - .byte 0 - .endrepeat - display_list_start: ; 24 lines overscan .repeat 3 @@ -310,10 +226,6 @@ display_list_start: .byte $0e .endrep - ; 8 scan lines, 1 row of 40-column text - .byte $42 - .addr textbuffer + 40 - .byte $41 ; jump and blank .addr display_list display_list_end: @@ -322,9 +234,9 @@ display_list_len = display_list_end - display_list_start color_map: .byte 0 .repeat 85 - .byte %01010101 - .byte %10101010 - .byte %11111111 + .byte 1 + .byte 2 + .byte 3 .endrepeat @@ -373,46 +285,23 @@ fill_masks: .byte %00000001 .byte %00000000 -pixel_masks: - .byte %11111111 - .byte %11110000 - .byte %11000000 - viewport_zoom: - .byte 0 - .byte 5 - .byte 7 - .byte 5 - .byte 7 - .byte 7 + .byte 1 + .byte 6 + .byte 8 + .byte 6 viewport_ox: - .dword ($00000000 & $3fffffff) << 2 - .dword ($ff110000 & $3fffffff) << 2 - .dword ($ff110000 & $3fffffff) << 2 - .dword ($fe400000 & $3fffffff) << 2 - .dword ($fe3b0000 & $3fffffff) << 2 - .dword $fd220000 + .dword $00000000 + .dword $ff110000 + .dword $ff110000 + .dword $fe400000 viewport_oy: - .dword ($00000000 & $3fffffff) << 2 - .dword ($ffb60000 & $3fffffff) << 2 - .dword ($ffbe0000 & $3fffffff) << 2 - .dword ($00000000 & $3fffffff) << 2 - .dword ($fffe0000 & $3fffffff) << 2 - .dword $ff000000 - -elapsed_work: - .dword 0 -elapsed_digit: - .byte 0 - -input_col: - .byte 0 -input_row: - .byte 0 -input_max: - .byte 0 + .dword $00000000 + .dword $ffb60000 + .dword $ffbe0000 + .dword $00000000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -461,7 +350,7 @@ input_max: sub 4, dest, arg1, arg2 .endmacro -; 3 + 5 * (bytes - 1) cycles +; 3 + 5 * bytes cycles .macro shl bytes, arg asl arg ; 3 cyc .repeat bytes-1, i @@ -469,23 +358,22 @@ input_max: .endrepeat .endmacro -; 8 cycles +; 13 cycles .macro shl16 arg shl 2, arg .endmacro -; 13 cycles +; 18 cycles .macro shl24 arg shl 3, arg .endmacro -; 18 cycles +; 23 cycles .macro shl32 arg shl 4, arg .endmacro ; 6 * bytes cycles -; 4 * bytes bytes .macro copy bytes, dest, arg .repeat bytes, byte ; 6 * bytes cycles lda arg + byte ; 3 cyc @@ -494,7 +382,6 @@ input_max: .endmacro ; 12 cycles -; 8 bytes .macro copy16 dest, arg copy 2, dest, arg .endmacro @@ -529,19 +416,17 @@ input_max: neg 4, arg .endmacro -; 11-27 + 18 * shift cycles -; 65-81 cycles for shift=3 +; 11-27 + 23 * shift cycles +; 103-119 cycles for shift=4 .macro shift_round_16 arg, shift .repeat shift - shl32 arg ; 18 cycles + shl32 arg ; 23 cycles .endrepeat round16 arg ; 11-27 cycles .endmacro ; input: arg1, arg2 as fixed4.12 ; output: dest as fixed8.24 -; patch point jsr at 16 bytes in -imul16_patch_offset = 16 .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc @@ -551,8 +436,6 @@ imul16_patch_offset = 16 ; input: arg as fixed4.12 ; output: dest as fixed8.24 -; patch point jsr at 8 bytes in -sqr16_patch_offset = 8 .macro sqr16 dest, arg copy16 FR0, arg ; 12 cyc jsr sqr16_func ; ? cyc @@ -570,6 +453,20 @@ sqr16_patch_offset = 8 sta dest + 1 .endmacro +; input: arg as u8 +; input/output: dest as u16 +; clobbers a, x +.macro sqr8_add16 dest, arg + ldx arg + clc + lda sqr_lobyte,x + adc dest + sta dest + lda sqr_hibyte,x + adc dest + 1 + sta dest + 1 +.endmacro + .segment "TABLES" ; lookup table for top byte -> PORTB value for bank-switch .align 256 @@ -698,6 +595,71 @@ bank_switch_table: .endif .endmacro +.proc imul8xe_init + + bank_switch 0 + lda #0 + sta EXTENDED_RAM + bank_switch 1 + lda #1 + sta EXTENDED_RAM + bank_switch 0 + lda EXTENDED_RAM + beq init + + ; no bank switching available, we just overwrite the value in base ram + rts + +init: + + ; patch imul16_func into a forwarding thunk to imul16xe_func + lda #$4c ; 'jmp' opcode + sta imul16_func + lda #.lobyte(imul16xe_func) + sta imul16_func + 1 + lda #.hibyte(imul16xe_func) + sta imul16_func + 2 + + ; ditto for sqr16_func -> sqr16xe_func + lda #$4c ; 'jmp' opcode + sta sqr16_func + lda #.lobyte(sqr16xe_func) + sta sqr16_func + 1 + lda #.hibyte(sqr16xe_func) + sta sqr16_func + 2 + + ; create the lookup table + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + sta ptr + lda #$40 + sta ptr + 1 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + + rts +.endproc ; Initialize a 16 KB chunk of the table ; input: multipliers in temp @@ -787,8 +749,9 @@ inner_loop: ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 imul8 result, arg1, arg2, xe - - imul8 result + 2, arg1 + 1, arg2 + 1, xe + lda #0 + sta result + 2 + sta result + 3 imul8 inter, arg1 + 1, arg2, xe add16 result + 1, result + 1, inter @@ -798,6 +761,9 @@ inner_loop: add16 result + 1, result + 1, inter add_carry result + 3 + imul8 inter, arg1 + 1, arg2 + 1, xe + add16 result + 2, result + 2, inter + ; In case of negative inputs, adjust high word ; https://stackoverflow.com/a/28827013 lda arg1 + 1 @@ -830,8 +796,9 @@ arg2_pos: ; h*h*256*256 + h*l*256 + h*l*256 + l*l sqr8 result, arg - - sqr8 result + 2, arg + 1 + lda #0 + sta result + 2 + sta result + 3 imul8 inter, arg + 1, arg, xe add16 result + 1, result + 1, inter @@ -839,6 +806,8 @@ arg2_pos: add16 result + 1, result + 1, inter add_carry result + 3 + sqr8_add16 result + 2, arg + 1 + rts ; 6 cyc .endscope .endmacro @@ -904,83 +873,10 @@ next: .endmacro -; input in FR0, 16 bits signed 3.13 fixed -; output in FR0, Atari float -; clobbers a, x, y, FR0, FR1 -.proc fixed3_13_to_float - ldx #.lobyte(fixed3_13_as_float) - ldy #.hibyte(fixed3_13_as_float) - jsr FLD1R - - ; check sign bit! conversion routine is for unsigned - lda FR0 + 1 - bpl positive - -negative: - neg16 FR0 - jsr IFP - - ; set float sign bit - lda FR0 - ora #$80 - sta FR0 - jmp common - -positive: - jsr IFP - -common: - jsr FDIV - rts - -.endproc - -; rounds to 16-bit first! -; input in FR0, 32 bits signed 6.26 fixed -; output in FR0, Atari float -; clobbers a, x, y, FR0, FR1 -.proc fixed6_26_to_float - shift_round_16 FR0, 3 - copy16 FR0, FR0 + 2 - jsr fixed3_13_to_float - rts -.endproc - -; input in FR0, Atari float -; output in FR0, 16 bits signed 3.13 fixed -; clobbers a, x, y, FR0, FR1 -.proc float_to_fixed3_13 - ldx #.lobyte(fixed3_13_as_float) - ldy #.hibyte(fixed3_13_as_float) - jsr FLD1R - jsr FMUL - - ; check sign bit! conversion routine is for unsigned - lda FR0 - bcc positive - -negative: - ; clearfloat sign bit - lda FR0 - eor #$80 - sta FR0 - - jsr FPI - neg16 FR0 - jmp common - -positive: - jsr FPI - -common: - rts - -.endproc - .proc mandelbrot ; input: - ; cx: position scaled to 6.26 fixed point - -32..+31.9 - ; cy: position scaled to 6.26 + ; cx: position scaled to 8.24 fixed point - -128..+127.9 + ; cy: position scaled to 8.24 ; ; output: ; iter: iteration count at escape or 0 @@ -1031,11 +927,6 @@ common: sta z_buffer_end loop: - inc count_iters - bne low_iters - inc count_iters + 1 -low_iters: - ; iter++ & max-iters break inc iter bne keep_going @@ -1043,7 +934,7 @@ low_iters: keep_going: .macro quick_exit arg, max - ; arg: fixed6.26 + ; arg: fixed8.24 ; max: integer .local positive .local negative @@ -1056,12 +947,12 @@ keep_going: bmi negative positive: - cmp #(max << 2) + cmp #max bmi all_done ; 'less than' jmp exit_path negative: - cmp #(256 - (max << 2)) + cmp #(256 - max) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' @@ -1081,7 +972,7 @@ keep_going: all_done: .endmacro - ; 6.26: (-32 .. 31.9) + ; 8.24: (-128 .. 127.9) ; zx = zx_2 - zy_2 + cx sub32 zx, zx_2, zy_2 add32 zx, zx, cx @@ -1092,20 +983,17 @@ keep_going: add32 zy, zy, cy quick_exit zy, 2 - ; convert 6.26 -> 3.13: (-4 .. +3.9) - shift_round_16 zx, 3 - shift_round_16 zy, 3 + ; convert 8.24 -> 4.12: (-8 .. +7.9) + shift_round_16 zx, 4 + shift_round_16 zy, 4 ; zx_2 = zx * zx -fixup_sqr16_1: sqr16 zx_2, zx + 2 ; zy_2 = zy * zy -fixup_sqr16_2: sqr16 zy_2, zy + 2 ; zx_zy = zx * zy -fixup_imul16_1: imul16 zx_zy, zx + 2, zy + 2 ; dist = zx_2 + zy_2 @@ -1219,9 +1107,9 @@ enough: .endmacro .macro zoom_factor dest, src, aspect - ; output: dest: fixed6.26 - ; input: src: fixed3.13 - ; aspect: fixed3.13 + ; output: dest: fixed8.24 + ; input: src: fixed4.12 + ; aspect: fixed4.12 ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest @@ -1239,11 +1127,8 @@ enough: ; iter -> color ldx iter lda color_map,x - ldx fill_level - and pixel_masks,x sta pixel_color - lda pixel_masks,x - eor #$ff + lda #(255 - 3) sta pixel_mask ; sy -> line base address in temp @@ -1292,23 +1177,22 @@ point: ; pixel_mask <<= pixel_shift (shifting in ones) and #3 sta pixel_shift + lda #3 + sec + sbc pixel_shift tax shift_loop: beq shift_done - lsr pixel_color - lsr pixel_color + asl pixel_color + asl pixel_color sec - ror pixel_mask + rol pixel_mask sec - ror pixel_mask + rol pixel_mask dex jmp shift_loop shift_done: - ldy fill_level - ldx fill_masks,y - inx - ; pixel_offset = temp >> 2 lda temp lsr a @@ -1316,94 +1200,48 @@ shift_done: sta pixel_offset tay -draw_pixel: ; read, mask, or, write lda (pixel_ptr),y and pixel_mask ora pixel_color sta (pixel_ptr),y - dex - beq done - clc - lda #40 - adc pixel_ptr - sta pixel_ptr - lda #0 - adc pixel_ptr + 1 - sta pixel_ptr + 1 - jmp draw_pixel - -done: rts .endproc -; in/out: column in text_col -; in: row in text_row -; in: pointer to string in INBUFF -; clobbers x/y/a/temp -.proc draw_string - drawptr = temp - strptr = INBUFF - - clc - lda #.lobyte(textbuffer) - adc text_col - sta temp - lda #.hibyte(textbuffer) - adc #0 - sta temp + 1 - - ldx text_row - beq done_rows -continue_rows: - clc - lda temp - adc #40 - sta temp - lda temp + 1 - adc #0 - sta temp + 1 - dex - bne continue_rows - -done_rows: - - ldy #0 +.macro draw_text_indirect col, len, strptr + ; clobbers A, X + .local loop + .local done + ldx #0 loop: - lda (strptr),y - ; if char's null, terminate c-style + cpx #len beq done - ; save the char for terminator check - pha - ; strip the high bit (terminator) - and #$7f - tax - lda char_map,x - sta (drawptr),y - iny - - pla - ; _last_ char has high bit set in atari rom routines - bmi done + txa + tay + lda (strptr),y + tay + lda char_map,y + sta textbuffer + col,x + inx jmp loop - done: - ; move the text column pointer - tya - clc - adc text_col - sta text_col +.endmacro - rts -.endproc - -.macro draw_string_const str - lda #.lobyte(str) - sta INBUFF - lda #.hibyte(str) - sta INBUFF + 1 - jsr draw_string +.macro draw_text col, len, cstr + ; clobbers A, X + .local loop + .local done + ldx #0 +loop: + cpx #len + beq done + ldy cstr,x + lda char_map,y + sta textbuffer + col,x + inx + jmp loop +done: .endmacro .proc vblank_handler @@ -1529,7 +1367,7 @@ skip_char: plus: lda zoom - cmp #7 + cmp #8 bpl skip_char inc zoom jmp done @@ -1540,20 +1378,16 @@ minus: dec zoom jmp done up: - add32 oy, oy, temp - jsr display_coords + sub32 oy, oy, temp jmp done down: - sub32 oy, oy, temp - jsr display_coords + add32 oy, oy, temp jmp done left: sub32 ox, ox, temp - jsr display_coords jmp done right: add32 ox, ox, temp - jsr display_coords jmp done number_keys: @@ -1565,11 +1399,7 @@ number_keys: beq three cpy #KEY_4 beq four - cpy #KEY_5 - beq five - cpy #KEY_6 - beq six - jmp letter_keys + jmp skip_char one: ldx #0 @@ -1582,27 +1412,7 @@ three: jmp load_key_viewport four: ldx #3 - jmp load_key_viewport -five: - ldx #4 - jmp load_key_viewport -six: - ldx #5 - jmp load_key_viewport - -letter_keys: - cpy #KEY_X - bne not_x - jsr input_x - jmp done -not_x: - cpy #KEY_Y - bne not_y - jsr input_y - jmp done -not_y: - jmp skip_char - + ; fall through load_key_viewport: jsr load_viewport ; fall through @@ -1612,23 +1422,6 @@ done: .endproc -.proc input_x - ldx #col_x - ldy #1 - jsr input_number - - - rts -.endproc - -.proc input_y - rts -.endproc - -.proc input_number - rts -.endproc - .proc clear_screen ; zero the range from framebuffer_top to display_list lda #.lobyte(framebuffer_top) @@ -1654,59 +1447,12 @@ zero_byte_loop: .proc status_bar ; Status bar - - lda #0 - sta text_col - lda #0 - sta text_row - draw_string_const str_self - - lda #(40 - str_run_len) - sta text_col - draw_string_const str_run + draw_text 0, str_self_len, str_self + draw_text 40 - str_run_len, str_run_len, str_run rts .endproc -.proc display_coords - lda #1 - sta text_row - lda #col_x - sta text_col - draw_string_const str_x - - copy32 FR0, ox - jsr fixed6_26_to_float - jsr FASC - jsr draw_string - - lda #col_y - sta text_col - draw_string_const str_y - - copy32 FR0, oy - jsr fixed6_26_to_float - jsr FASC - jsr draw_string - - lda #col_zoom - sta text_col - draw_string_const str_zoom - - lda zoom - clc - adc #0 - sta FR0 - lda #0 - sta FR0 + 1 - jsr IFP - jsr FASC - jsr draw_string - - rts - -.endproc - ; input: viewport selector in x ; clobbers: a, x .proc load_viewport @@ -1758,7 +1504,6 @@ zero_byte_loop: sta DMACTL jsr clear_screen - jsr display_coords ; Copy the display list into properly aligned memory ; Can't cross 1024-byte boundaries :D @@ -1797,24 +1542,19 @@ copy_byte_loop: jsr SETVBV main_loop: - ; count_frames = 0; count_iters = 0 + ; count_frames = 0; count_pixels = 0 lda #0 sta count_frames - sta count_iters - sta count_iters + 1 + sta count_pixels - ; total_sec = 0.0; total_iters = 0.0 - jsr ZFR0 - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FST0R - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) - jsr FST0R + ; total_ms = 0.0; total_pixels = 0.0 + ldx #total_ms + jsr ZF1 + ldx #total_pixels + jsr ZF1 jsr clear_screen jsr status_bar - jsr display_coords lda #0 sta fill_level @@ -1872,7 +1612,6 @@ not_skipped_mask: zoom_factor cx, sx, aspect_x add32 cx, cx, ox zoom_factor cy, sy, aspect_y - neg32 cy add32 cy, cy, oy jsr mandelbrot jsr pset @@ -1884,32 +1623,38 @@ not_skipped_mask: no_key: ; check if we should update the counters + ; + ; count_pixels >= width? update! + inc count_pixels + lda count_pixels + cmp #width + bmi update_status ; count_frames >= 120? update! lda count_frames cmp #120 ; >= 2 seconds - bpl update_status - jmp skipped + bmi skipped update_status: - ; FR0 = (float)count_iters & clear count_iters - copy16 FR0, count_iters - jsr IFP + ; FR0 = (float)count_pixels & clear count_pixels + lda count_pixels + sta FR0 lda #0 - sta count_iters - sta count_iters + 1 + sta FR0 + 1 + sta count_pixels + jsr IFP - ; FR1 = total_iters - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) + ; FR1 = total_pixels + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) jsr FLD1R ; FR0 += FR1 jsr FADD - ; total_iters = FR0 - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) + ; total_pixels = FR0 + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) jsr FST0R @@ -1922,100 +1667,44 @@ update_status: sta count_frames jsr IFP - ; FR0 *= sec_per_frame - ldx #.lobyte(sec_per_frame) - ldy #.hibyte(sec_per_frame) + ; FR0 *= ms_per_frame + ldx #.lobyte(ms_per_frame) + ldy #.hibyte(ms_per_frame) jsr FLD1R jsr FMUL - ; FR0 += total_sec - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) + ; FR0 += total_ms + ldx #total_ms + ldy #0 jsr FLD1R jsr FADD - ; total_sec = FR0 - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) + ; total_ms = FR0 + ldx #total_ms + ldy #0 jsr FST0R - ; FR0 /= total_iters - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) + ; FR0 /= total_pixels + ldx #total_pixels + ldy #0 jsr FLD1R jsr FDIV - ; FR0 *= us_per_sec - ldx #.lobyte(us_per_sec) - ldy #.hibyte(us_per_sec) - jsr FLD1R - jsr FMUL - - ; round (down) to integer - jsr FPI - clc - jsr IFP - - lda #speed_start - sta text_col - lda #0 - sta text_row - draw_string_const str_speed - - lda text_col - pha - draw_string_const str_padding - pla - sta text_col - - ; convert to ASCII in INBUFF and print + ; convert to ASCII in INBUFF jsr FASC - jsr draw_string - ; elapsed time - ; FR0 = total_sec - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FLD0R - ; FR0 -> integer -> elapsed_work - jsr FPI - lda FR0 - sta elapsed_work - lda FR0 + 1 - sta elapsed_work + 1 - - draw_string_const str_space - - .macro do_countdown divisor, digits - ldx #.lobyte(divisor) - ldy #.hibyte(divisor) - lda #.lobyte(digits) - sta INBUFF - lda #.hibyte(digits) - sta INBUFF + 1 - jsr countdown - .endmacro - do_countdown 36000, digits_space - do_countdown 3600, digits_zero - draw_string_const str_h - do_countdown 600, digits_zero - do_countdown 60, digits_zero - draw_string_const str_m - do_countdown 10, digits_zero - do_countdown 1, digits_zero - draw_string_const str_s + ; print the first 6 digits + draw_text_indirect speed_start, speed_precision, INBUFF + draw_text speed_start + speed_precision, str_speed_len, str_speed skipped: - ; sx += fill_level[fill_masks] + 1 - ldx fill_level - lda fill_masks,x clc - adc #1 ; will never carry - adc sx + lda sx + adc #1 sta sx - lda #0 - adc sx + 1 + lda sx + 1 + adc #0 sta sx + 1 lda sx @@ -2025,15 +1714,12 @@ skipped: loop_sx_done: - ; sy += fill_level[fill_masks] + 1 - ldx fill_level - lda fill_masks,x clc - adc #1 ; will never carry - adc sy + lda sy + adc #1 sta sy - lda #0 - adc sy + 1 + lda sy + 1 + adc #0 sta sy + 1 lda sy @@ -2052,130 +1738,9 @@ fill_loop_done: loop: ; finished - - lda #(40 - str_done_len) - sta text_col - lda #0 - sta text_row - draw_string_const str_done - + draw_text 40 - str_done_len, str_done_len, str_done jsr keycheck beq loop jmp main_loop .endproc - -; digit string in INBUFF -; divisor X/Y -; clobbers temp, calls draw_string -.proc countdown - divisor = temp - stx divisor - sty divisor + 1 - - ; count the hours - ldy #0 -countdown_loop: - lda elapsed_work + 1 - cmp divisor + 1 - beq countdown_lobyte - bcc countdown_done - bcs countdown_inc -countdown_lobyte: - lda elapsed_work - cmp divisor - bcc countdown_done -countdown_inc: - sec - lda elapsed_work - sbc divisor - sta elapsed_work - lda elapsed_work + 1 - sbc divisor + 1 - sta elapsed_work + 1 - iny - jmp countdown_loop -countdown_done: - lda (INBUFF),y - eor #$80 - sta elapsed_digit - lda #.lobyte(elapsed_digit) - sta INBUFF - lda #.hibyte(elapsed_digit) - sta INBUFF + 1 - jsr draw_string - rts -.endproc - -.proc imul8xe_init - - bank_switch 0 - lda #0 - sta EXTENDED_RAM - bank_switch 1 - lda #1 - sta EXTENDED_RAM - bank_switch 0 - lda EXTENDED_RAM - beq init - - ; no bank switching available, we just overwrite the value in base ram - rts - -init: - - ; patch imul16_func into a forwarding thunk to imul16xe_func - lda #$4c ; 'jmp' opcode - sta imul16_func - lda #.lobyte(imul16xe_func) - sta imul16_func + 1 - sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1 - lda #.hibyte(imul16xe_func) - sta imul16_func + 2 - sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2 - - ; ditto for sqr16_func -> sqr16xe_func - lda #$4c ; 'jmp' opcode - sta sqr16_func - lda #.lobyte(sqr16xe_func) - sta sqr16_func + 1 - sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1 - sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1 - lda #.hibyte(sqr16xe_func) - sta sqr16_func + 2 - sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2 - sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2 - - - ; create the lookup table - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - sta ptr - lda #$40 - sta ptr + 1 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -.endproc diff --git a/readme.md b/readme.md index 2c9efc1..d60644c 100644 --- a/readme.md +++ b/readme.md @@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g ## Current state -Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. +Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys. The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. @@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication -The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates. +The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. Iterations are capped at 255. diff --git a/todo.md b/todo.md index 6807ae2..284d653 100644 --- a/todo.md +++ b/todo.md @@ -1,17 +1,19 @@ things to try: -* fix status bar to show elapsed time, per-iter time, per-pixel iter count - -* 'turbo' mode disabling graphics in full or part +* skip add on the top-byte multiply in sqr8/mul8 + * should save a few cycles, suggestion by jamey * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D -* maybe clean up the load/layout of the big mul table - -* consider alternate lookup tables in the top 16KB under ROM +* try 3.13 fixed point instead of 4.12 for more precision + * can we get away without the extra bit? + * since exit compare space would be 6.26 i think so * y-axis mirror optimization +* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering + * maybe redo tiering to just 4x4, 2x2, 1x1? + * extract viewport for display & re-input via keyboard * fujinet screenshot/viewport uploader