diff --git a/Makefile b/Makefile index 711adcd..bd14c7d 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ all : mandel.xex mandel.xex : mandel.o tables.o atari-asm-xex.cfg - ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o %.o : %.s ca65 -o $@ $< @@ -15,6 +15,4 @@ clean : rm -f tables.s rm -f *.o rm -f *.xex - rm -f mandel.map - diff --git a/mandel.s b/mandel.s index b52f24a..b0c2b42 100644 --- a/mandel.s +++ b/mandel.s @@ -1,44 +1,43 @@ ; Our zero-page vars -ox = $80 ; fixed6.26: center point x -oy = $84 ; fixed6.26: center point y -cx = $88 ; fixed6.26: c_x -cy = $8c ; fixed6.26: c_y +sx = $80 ; i16: screen pixel x +sy = $82 ; i16: screen pixel y +ox = $84 ; fixed4.12: center point x +oy = $86 ; fixed4.12: center point y +cx = $88 ; fixed4.12: c_x +cy = $8a ; fixed4.12: c_y +zx = $8c ; fixed4.12: z_x +zy = $8e ; fixed4.12: z_y -zx = $90 ; fixed6.26: z_x -zy = $94 ; fixed6.26: z_y -zx_2 = $98 ; fixed6.26: z_x^2 -zy_2 = $9c ; fixed6.26: z_y^2 +zx_2 = $90 ; fixed4.12: z_x^2 +zy_2 = $92 ; fixed4.12: z_y^2 +zx_zy = $94 ; fixed4.12: z_x * z_y +dist = $96 ; fixed4.12: z_x^2 + z_y^2 -zx_zy = $a0 ; fixed6.26: z_x * z_y -dist = $a4 ; fixed6.26: z_x^2 + z_y^2 -sx = $a8 ; i16: screen pixel x -sy = $aa ; i16: screen pixel y -z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not -z_buffer_start = $ad ; u8: index into z_buffer -z_buffer_end = $ae ; u8: index into z_buffer -iter = $af ; u8: iteration count +iter = $a0 ; u8: iteration count -ptr = $b0 ; u16 -pixel_ptr = $b2 ; u16 -zoom = $b4 ; u8: zoom shift level -fill_level = $b5 ; u8 -pixel_color = $b6 ; u8 -pixel_mask = $b7 ; u8 -pixel_shift = $b8 ; u8 -pixel_offset = $b9 ; u8 -palette_offset = $ba ; u8 -chroma_offset = $bb ; u8 -palette_ticks = $bc ; u8 -chroma_ticks = $bd ; u8 -count_frames = $be ; u8 -; free space $bf +zoom = $a1 ; u8: zoom shift level +count_frames = $a2 ; u8 +count_pixels = $a3 ; u8 +total_ms = $a4 ; float48 +total_pixels = $aa ; float48 -count_iters = $c0 ; u16 -text_col = $c2 ; u8 -text_row = $c3 ; u8 -; free space c4-cb -temp = $cc ; u16 -temp2 = $ce ; u16 +z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not +z_buffer_start = $b1 ; u8: index into z_buffer +z_buffer_end = $b2 ; u8: index into z_buffer +temp = $b4 ; u16 +temp2 = $b6 ; u16 +pixel_ptr = $b8 ; u16 +pixel_color = $ba ; u8 +pixel_mask = $bb ; u8 +pixel_shift = $bc ; u8 +pixel_offset = $bd ; u8 +fill_level = $be ; u8 +palette_offset = $bf ; u8 + +palette_ticks = $c0 ; u8 +chroma_ticks = $c1 ; u8 +chroma_offset = $c2 ; u8 +ptr = $c4 ; u16 palette_delay = 23 chroma_delay = 137 @@ -61,12 +60,10 @@ LBUFF = $0580 ; result buffer for FASC routine ; FP ROM routine vectors FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) -FPI = $D9D2 ; floating point to integer FADD = $DA66 ; ADDITION (FR0 += FR1) FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) FDIV = $DB28 ; DIVISION (FR0 /= FR1) -ZFR0 = $DA44 ; clear FR0 ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) @@ -80,7 +77,7 @@ framebuffer_bottom = $b000 display_list = $bf00 framebuffer_end = $c000 -height = 176 +height = 184 half_height = height >> 1 width = 160 half_width = width >> 1 @@ -126,21 +123,14 @@ KEY_7 = 51 KEY_8 = 53 KEY_9 = 48 KEY_0 = 50 -KEY_PERIOD = 34 -KEY_E = 42 -KEY_X = 22 -KEY_Y = 43 .struct float48 exponent .byte mantissa .byte 5 .endstruct -.import mul_lobyte256 -.import mul_hibyte256 -.import mul_hibyte512 -.import sqr_lobyte -.import sqr_hibyte +.import mul_lobyte +.import mul_hibyte .data @@ -148,68 +138,25 @@ strings: str_self: .byte "MANDEL-6502" str_self_end: - .byte 0 str_speed: - .byte "us/iter: " + .byte " ms/px" str_speed_end: - .byte 0 str_run: .byte " RUN" str_run_end: - .byte 0 str_done: .byte "DONE" str_done_end: - .byte 0 -str_padding: - .byte " " -str_padding_end: - .byte 0 - -str_space: - .byte " " - .byte 0 - -str_h: - .byte "h" - .byte 0 -str_m: - .byte "m" - .byte 0 -str_s: - .byte "s" - .byte 0 +str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done -str_padding_len = str_padding_end - str_padding +speed_precision = 6 -; "3h59m59s" -str_elapsed_spacer = 8 -speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1 +speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 +speed_len = 14 + str_speed_len -col_x = 1 -str_x: - .byte "X:" - .byte 0 -str_x_len = 2 -str_x_space = 12 -str_x_padding = 2 - -col_y = col_x + str_x_len + str_x_space + str_x_padding -str_y: - .byte "Y:" - .byte 0 -str_y_len = 2 -str_y_space = 12 -str_y_padding = 2 - -col_zoom = col_y + str_y_len + str_y_space + str_y_padding -str_zoom: - .byte "ZOOM:" - .byte 0 -str_zoom_len = 5 char_map: ; Map ATASCII string values to framebuffer font entries @@ -225,12 +172,8 @@ char_map: .endrepeat hex_chars: -digits_zero: .byte "0123456789abcdef" -digits_space: - .byte " 123456789abcdef" - aspect: ; aspect ratio! ; pixels at 320w are 5:6 (narrow) @@ -244,49 +187,20 @@ aspect: ; ; 184h is the equiv of 220.8h at square pixels ; 320 / 220.8 = 1.45 display aspect ratio -aspect_x: ; fixed3.13 5/4 - .word 5 << (13 - 2) +aspect_x: ; fixed4.16 5/4 + .word 5 << (12 - 2) -aspect_y: ; fixed3.13 3/4 - .word 3 << (13 - 2) +aspect_y: ; fixed4.16 3/4 + .word 3 << (12 - 2) -fixed3_13_as_float: ; float48 - ; 1 << 13 - ; 8192 - ; 81 92 . 00 00 00 - .byte 65 ; exponent/sign - +1 byte - .byte $81 - .byte $92 - .byte $00 - .byte $00 - .byte $00 - -sec_per_frame: ; float48 00 . 01 66 66 66 67 - .byte 63 ; exponent/sign - -1 bytes - .byte $01 ; BCD digits +ms_per_frame: ; float48 16.66666667 + .byte 64 ; exponent/sign + .byte $16 ; BCD digits .byte $66 .byte $66 .byte $66 .byte $67 -us_per_sec: ; float48 1e9 01 00 0,0 00 . 00 - .byte 67 ; exponent/sign +3 bytes - .byte $01 ; BCD digits - .byte $00 - .byte $00 - .byte $00 - .byte $00 - -total_iters: ; float48 - .repeat 6 - .byte 0 - .endrepeat - -total_sec: ; float48 - .repeat 6 - .byte 0 - .endrepeat - display_list_start: ; 24 lines overscan .repeat 3 @@ -310,10 +224,6 @@ display_list_start: .byte $0e .endrep - ; 8 scan lines, 1 row of 40-column text - .byte $42 - .addr textbuffer + 40 - .byte $41 ; jump and blank .addr display_list display_list_end: @@ -322,9 +232,9 @@ display_list_len = display_list_end - display_list_start color_map: .byte 0 .repeat 85 - .byte %01010101 - .byte %10101010 - .byte %11111111 + .byte 1 + .byte 2 + .byte 3 .endrepeat @@ -373,46 +283,23 @@ fill_masks: .byte %00000001 .byte %00000000 -pixel_masks: - .byte %11111111 - .byte %11110000 - .byte %11000000 - viewport_zoom: - .byte 0 - .byte 5 - .byte 7 - .byte 5 - .byte 7 - .byte 7 + .byte 1 + .byte 6 + .byte 8 + .byte 6 viewport_ox: - .dword ($00000000 & $3fffffff) << 2 - .dword ($ff110000 & $3fffffff) << 2 - .dword ($ff110000 & $3fffffff) << 2 - .dword ($fe400000 & $3fffffff) << 2 - .dword ($fe3b0000 & $3fffffff) << 2 - .dword $fd220000 + .word $0000 + .word $f110 + .word $f110 + .word $e400 viewport_oy: - .dword ($00000000 & $3fffffff) << 2 - .dword ($ffb60000 & $3fffffff) << 2 - .dword ($ffbe0000 & $3fffffff) << 2 - .dword ($00000000 & $3fffffff) << 2 - .dword ($fffe0000 & $3fffffff) << 2 - .dword $ff000000 - -elapsed_work: - .dword 0 -elapsed_digit: - .byte 0 - -input_col: - .byte 0 -input_row: - .byte 0 -input_max: - .byte 0 + .word $0000 + .word $fb60 + .word $fbe0 + .word $0000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 @@ -431,7 +318,7 @@ input_max: ; 38 cycles .macro add32 dest, arg1, arg2 - add 4, dest, arg1, arg2 + add 4, dest, arg2, dest .endmacro ; 8 cycles @@ -485,7 +372,6 @@ input_max: .endmacro ; 6 * bytes cycles -; 4 * bytes bytes .macro copy bytes, dest, arg .repeat bytes, byte ; 6 * bytes cycles lda arg + byte ; 3 cyc @@ -494,7 +380,6 @@ input_max: .endmacro ; 12 cycles -; 8 bytes .macro copy16 dest, arg copy 2, dest, arg .endmacro @@ -529,44 +414,41 @@ input_max: neg 4, arg .endmacro -; 11-27 + 18 * shift cycles -; 65-81 cycles for shift=3 +; 11-27 + 23 * shift cycles +; 103-119 cycles for shift=4 .macro shift_round_16 arg, shift .repeat shift - shl32 arg ; 18 cycles + shl32 arg ; 23 cycles .endrepeat round16 arg ; 11-27 cycles .endmacro -; input: arg1, arg2 as fixed4.12 -; output: dest as fixed8.24 -; patch point jsr at 16 bytes in -imul16_patch_offset = 16 -.macro imul16 dest, arg1, arg2 +.macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc jsr imul16_func ; ? cyc - copy32 dest, FR2 ; 24 cyc + shift_round_16 FR2, shift ; 103-119 cycles for shift=4 + copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; input: arg as fixed4.12 -; output: dest as fixed8.24 -; patch point jsr at 8 bytes in -sqr16_patch_offset = 8 -.macro sqr16 dest, arg +.macro sqr16_round dest, arg, shift + ;imul16_round dest, arg, arg, shift copy16 FR0, arg ; 12 cyc - jsr sqr16_func ; ? cyc - copy32 dest, FR2 ; 24 cyc + jsr sqr16_func ; ? cyc + shift_round_16 FR2, shift ; 103-119 cycles for shift=4 + copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; input: arg as u8 -; output: dest as u16 ; clobbers a, x .macro sqr8 dest, arg ldx arg - lda sqr_lobyte,x + txa + lsr + lda mul_lobyte,x + rol sta dest - lda sqr_hibyte,x + lda mul_hibyte,x + rol sta dest + 1 .endmacro @@ -655,22 +537,25 @@ bank_switch_table: clc ; 2 cyc adc mul_factor_x ; 3 cyc tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc + lda mul_hibyte,x ; 4 cyc + bcc next ; 2 cyc + ; carry is set so we get to add 1 for free, but need to add 0x80 + adc #$7f ; 2 cyc + clc ; 2 cyc + ; stash the sum temporarily so we can use it as an operand to add + stx mul_product_lo ; 3 cyc + adc mul_product_lo ; 3 cyc next: + sec ; 2 cyc sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc + lda mul_lobyte,x ; 4 cyc ; - a^2/2 ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc ; + x & a & 1: @@ -689,15 +574,80 @@ bank_switch_table: ; - x^2/2 small_product: sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc + sbc mul_lobyte,x ; 4 cyc sta mul_product_lo ; 3 cyc lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc + sbc mul_hibyte,x ; 4 cyc sta mul_product_hi ; 3 cyc .endscope .endif .endmacro +.proc imul8xe_init + + bank_switch 0 + lda #0 + sta EXTENDED_RAM + bank_switch 1 + lda #1 + sta EXTENDED_RAM + bank_switch 0 + lda EXTENDED_RAM + beq init + + ; no bank switching available, we just overwrite the value in base ram + rts + +init: + + ; patch imul16_func into a forwarding thunk to imul16xe_func + lda #$4c ; 'jmp' opcode + sta imul16_func + lda #.lobyte(imul16xe_func) + sta imul16_func + 1 + lda #.hibyte(imul16xe_func) + sta imul16_func + 2 + + ; ditto for sqr16_func -> sqr16xe_func + lda #$4c ; 'jmp' opcode + sta sqr16_func + lda #.lobyte(sqr16xe_func) + sta sqr16_func + 1 + lda #.hibyte(sqr16xe_func) + sta sqr16_func + 2 + + ; create the lookup table + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + sta ptr + lda #$40 + sta ptr + 1 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + jsr imul8xe_init_section + + rts +.endproc ; Initialize a 16 KB chunk of the table ; input: multipliers in temp @@ -787,8 +737,9 @@ inner_loop: ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 imul8 result, arg1, arg2, xe - - imul8 result + 2, arg1 + 1, arg2 + 1, xe + lda #0 + sta result + 2 + sta result + 3 imul8 inter, arg1 + 1, arg2, xe add16 result + 1, result + 1, inter @@ -798,6 +749,9 @@ inner_loop: add16 result + 1, result + 1, inter add_carry result + 3 + imul8 inter, arg1 + 1, arg2 + 1, xe + add16 result + 2, result + 2, inter + ; In case of negative inputs, adjust high word ; https://stackoverflow.com/a/28827013 lda arg1 + 1 @@ -830,11 +784,10 @@ arg2_pos: ; h*h*256*256 + h*l*256 + h*l*256 + l*l sqr8 result, arg - sqr8 result + 2, arg + 1 imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter + shl16 inter add_carry result + 3 add16 result + 1, result + 1, inter add_carry result + 3 @@ -904,83 +857,10 @@ next: .endmacro -; input in FR0, 16 bits signed 3.13 fixed -; output in FR0, Atari float -; clobbers a, x, y, FR0, FR1 -.proc fixed3_13_to_float - ldx #.lobyte(fixed3_13_as_float) - ldy #.hibyte(fixed3_13_as_float) - jsr FLD1R - - ; check sign bit! conversion routine is for unsigned - lda FR0 + 1 - bpl positive - -negative: - neg16 FR0 - jsr IFP - - ; set float sign bit - lda FR0 - ora #$80 - sta FR0 - jmp common - -positive: - jsr IFP - -common: - jsr FDIV - rts - -.endproc - -; rounds to 16-bit first! -; input in FR0, 32 bits signed 6.26 fixed -; output in FR0, Atari float -; clobbers a, x, y, FR0, FR1 -.proc fixed6_26_to_float - shift_round_16 FR0, 3 - copy16 FR0, FR0 + 2 - jsr fixed3_13_to_float - rts -.endproc - -; input in FR0, Atari float -; output in FR0, 16 bits signed 3.13 fixed -; clobbers a, x, y, FR0, FR1 -.proc float_to_fixed3_13 - ldx #.lobyte(fixed3_13_as_float) - ldy #.hibyte(fixed3_13_as_float) - jsr FLD1R - jsr FMUL - - ; check sign bit! conversion routine is for unsigned - lda FR0 - bcc positive - -negative: - ; clearfloat sign bit - lda FR0 - eor #$80 - sta FR0 - - jsr FPI - neg16 FR0 - jmp common - -positive: - jsr FPI - -common: - rts - -.endproc - .proc mandelbrot ; input: - ; cx: position scaled to 6.26 fixed point - -32..+31.9 - ; cy: position scaled to 6.26 + ; cx: position scaled to 4.12 fixed point - -8..+7.9 + ; cy: position scaled to 4.12 ; ; output: ; iter: iteration count at escape or 0 @@ -992,50 +872,16 @@ common: ; zx_zy = 0 ; dist = 0 ; iter = 0 -; lda #00 -; ldx #(iter - zx + 1) -;initloop: -; sta zx - 1,x -; dex -; bne initloop -; sta z_buffer_start -; sta z_buffer_end - lda #00 - sta zx - sta zx + 1 - sta zx + 2 - sta zx + 3 - sta zy - sta zy + 1 - sta zy + 2 - sta zy + 3 - sta zx_2 - sta zx_2 + 1 - sta zx_2 + 2 - sta zx_2 + 3 - sta zy_2 - sta zy_2 + 1 - sta zy_2 + 2 - sta zy_2 + 3 - sta zx_zy - sta zx_zy + 1 - sta zx_zy + 2 - sta zx_zy + 3 - sta dist - sta dist + 1 - sta dist + 2 - sta dist + 3 - sta iter + ldx #(iter - zx + 1) +initloop: + sta zx - 1,x + dex + bne initloop sta z_buffer_start sta z_buffer_end loop: - inc count_iters - bne low_iters - inc count_iters + 1 -low_iters: - ; iter++ & max-iters break inc iter bne keep_going @@ -1043,8 +889,6 @@ low_iters: keep_going: .macro quick_exit arg, max - ; arg: fixed6.26 - ; max: integer .local positive .local negative .local nope_out @@ -1052,64 +896,51 @@ keep_going: .local all_done ; check sign bit - lda arg + 3 + lda arg + 1 bmi negative positive: - cmp #(max << 2) + cmp #((max) << 4) bmi all_done ; 'less than' jmp exit_path negative: - cmp #(256 - (max << 2)) + cmp #(256 - ((max) << 4)) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: jmp exit_path - + first_equal: - ; following bytes all 0 shows it's really 'equal' - lda arg + 2 - bne all_done - lda arg + 1 - bne all_done lda arg - bne all_done - jmp exit_path + beq nope_out ; 2nd byte 0 shows it's really 'equal' all_done: .endmacro - ; 6.26: (-32 .. 31.9) + ; 4.12: (-8 .. +7.9) ; zx = zx_2 - zy_2 + cx - sub32 zx, zx_2, zy_2 - add32 zx, zx, cx + sub16 zx, zx_2, zy_2 + add16 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy - add32 zy, zx_zy, zx_zy - add32 zy, zy, cy + add16 zy, zx_zy, zx_zy + add16 zy, zy, cy quick_exit zy, 2 - ; convert 6.26 -> 3.13: (-4 .. +3.9) - shift_round_16 zx, 3 - shift_round_16 zy, 3 - ; zx_2 = zx * zx -fixup_sqr16_1: - sqr16 zx_2, zx + 2 + sqr16_round zx_2, zx, 4 ; zy_2 = zy * zy -fixup_sqr16_2: - sqr16 zy_2, zy + 2 + sqr16_round zy_2, zy, 4 ; zx_zy = zx * zy -fixup_imul16_1: - imul16 zx_zy, zx + 2, zy + 2 + imul16_round zx_zy, zx, zy, 4 ; dist = zx_2 + zy_2 - add32 dist, zx_2, zy_2 + add16 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer @@ -1146,10 +977,10 @@ z_buffer_loop: ; Compare the previously stored z values ldy #0 - z_compare zx + 2 - z_compare zx + 3 - z_compare zy + 2 - z_compare zy + 3 + z_compare zx + z_compare zx + 1 + z_compare zy + z_compare zy + 1 cpy #4 bne z_no_matches @@ -1164,10 +995,10 @@ z_no_matches: z_nothing_to_read: ; Store and expand - z_store zx + 2 - z_store zx + 3 - z_store zy + 2 - z_store zy + 3 + z_store zx + z_store zx + 1 + z_store zy + z_store zy + 1 z_advance stx z_buffer_end @@ -1218,17 +1049,14 @@ cont: enough: .endmacro -.macro zoom_factor dest, src, aspect - ; output: dest: fixed6.26 - ; input: src: fixed3.13 - ; aspect: fixed3.13 +.macro zoom_factor dest, src, zoom, aspect ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16 dest, dest, aspect + imul16_round dest, dest, aspect, 4 .endmacro .proc pset @@ -1239,11 +1067,8 @@ enough: ; iter -> color ldx iter lda color_map,x - ldx fill_level - and pixel_masks,x sta pixel_color - lda pixel_masks,x - eor #$ff + lda #(255 - 3) sta pixel_mask ; sy -> line base address in temp @@ -1292,23 +1117,22 @@ point: ; pixel_mask <<= pixel_shift (shifting in ones) and #3 sta pixel_shift + lda #3 + sec + sbc pixel_shift tax shift_loop: beq shift_done - lsr pixel_color - lsr pixel_color + asl pixel_color + asl pixel_color sec - ror pixel_mask + rol pixel_mask sec - ror pixel_mask + rol pixel_mask dex jmp shift_loop shift_done: - ldy fill_level - ldx fill_masks,y - inx - ; pixel_offset = temp >> 2 lda temp lsr a @@ -1316,94 +1140,48 @@ shift_done: sta pixel_offset tay -draw_pixel: ; read, mask, or, write lda (pixel_ptr),y and pixel_mask ora pixel_color sta (pixel_ptr),y - dex - beq done - clc - lda #40 - adc pixel_ptr - sta pixel_ptr - lda #0 - adc pixel_ptr + 1 - sta pixel_ptr + 1 - jmp draw_pixel - -done: rts .endproc -; in/out: column in text_col -; in: row in text_row -; in: pointer to string in INBUFF -; clobbers x/y/a/temp -.proc draw_string - drawptr = temp - strptr = INBUFF - - clc - lda #.lobyte(textbuffer) - adc text_col - sta temp - lda #.hibyte(textbuffer) - adc #0 - sta temp + 1 - - ldx text_row - beq done_rows -continue_rows: - clc - lda temp - adc #40 - sta temp - lda temp + 1 - adc #0 - sta temp + 1 - dex - bne continue_rows - -done_rows: - - ldy #0 +.macro draw_text_indirect col, len, strptr + ; clobbers A, X + .local loop + .local done + ldx #0 loop: - lda (strptr),y - ; if char's null, terminate c-style + cpx #len beq done - ; save the char for terminator check - pha - ; strip the high bit (terminator) - and #$7f - tax - lda char_map,x - sta (drawptr),y - iny - - pla - ; _last_ char has high bit set in atari rom routines - bmi done + txa + tay + lda (strptr),y + tay + lda char_map,y + sta textbuffer + col,x + inx jmp loop - done: - ; move the text column pointer - tya - clc - adc text_col - sta text_col +.endmacro - rts -.endproc - -.macro draw_string_const str - lda #.lobyte(str) - sta INBUFF - lda #.hibyte(str) - sta INBUFF + 1 - jsr draw_string +.macro draw_text col, len, cstr + ; clobbers A, X + .local loop + .local done + ldx #0 +loop: + cpx #len + beq done + ldy cstr,x + lda char_map,y + sta textbuffer + col,x + inx + jmp loop +done: .endmacro .proc vblank_handler @@ -1503,15 +1281,12 @@ skip_luma: cpy #KEY_MINUS beq minus - ; temp+temp2 = $00010000 << (8 - zoom) - lda #$00 + ; temp = $0010 << (8 - zoom) + lda #$10 sta temp - sta temp + 1 - lda #$01 - sta temp + 2 lda #$00 - sta temp + 3 - scale_zoom temp + 2 + sta temp + 1 + scale_zoom temp cpy #KEY_UP beq up @@ -1521,7 +1296,14 @@ skip_luma: beq left cpy #KEY_RIGHT beq right - jmp number_keys + cpy #KEY_1 + beq one + cpy #KEY_2 + beq two + cpy #KEY_3 + beq three + cpy #KEY_4 + beq four skip_char: lda #0 @@ -1529,7 +1311,7 @@ skip_char: plus: lda zoom - cmp #7 + cmp #8 bpl skip_char inc zoom jmp done @@ -1540,37 +1322,17 @@ minus: dec zoom jmp done up: - add32 oy, oy, temp - jsr display_coords + sub16 oy, oy, temp jmp done down: - sub32 oy, oy, temp - jsr display_coords + add16 oy, oy, temp jmp done left: - sub32 ox, ox, temp - jsr display_coords + sub16 ox, ox, temp jmp done right: - add32 ox, ox, temp - jsr display_coords + add16 ox, ox, temp jmp done - -number_keys: - cpy #KEY_1 - beq one - cpy #KEY_2 - beq two - cpy #KEY_3 - beq three - cpy #KEY_4 - beq four - cpy #KEY_5 - beq five - cpy #KEY_6 - beq six - jmp letter_keys - one: ldx #0 jmp load_key_viewport @@ -1582,27 +1344,7 @@ three: jmp load_key_viewport four: ldx #3 - jmp load_key_viewport -five: - ldx #4 - jmp load_key_viewport -six: - ldx #5 - jmp load_key_viewport - -letter_keys: - cpy #KEY_X - bne not_x - jsr input_x - jmp done -not_x: - cpy #KEY_Y - bne not_y - jsr input_y - jmp done -not_y: - jmp skip_char - + ; fall through load_key_viewport: jsr load_viewport ; fall through @@ -1612,23 +1354,6 @@ done: .endproc -.proc input_x - ldx #col_x - ldy #1 - jsr input_number - - - rts -.endproc - -.proc input_y - rts -.endproc - -.proc input_number - rts -.endproc - .proc clear_screen ; zero the range from framebuffer_top to display_list lda #.lobyte(framebuffer_top) @@ -1654,59 +1379,12 @@ zero_byte_loop: .proc status_bar ; Status bar - - lda #0 - sta text_col - lda #0 - sta text_row - draw_string_const str_self - - lda #(40 - str_run_len) - sta text_col - draw_string_const str_run + draw_text 0, str_self_len, str_self + draw_text 40 - str_run_len, str_run_len, str_run rts .endproc -.proc display_coords - lda #1 - sta text_row - lda #col_x - sta text_col - draw_string_const str_x - - copy32 FR0, ox - jsr fixed6_26_to_float - jsr FASC - jsr draw_string - - lda #col_y - sta text_col - draw_string_const str_y - - copy32 FR0, oy - jsr fixed6_26_to_float - jsr FASC - jsr draw_string - - lda #col_zoom - sta text_col - draw_string_const str_zoom - - lda zoom - clc - adc #0 - sta FR0 - lda #0 - sta FR0 + 1 - jsr IFP - jsr FASC - jsr draw_string - - rts - -.endproc - ; input: viewport selector in x ; clobbers: a, x .proc load_viewport @@ -1716,32 +1394,17 @@ zero_byte_loop: txa asl a - asl a - tax lda viewport_ox,x sta ox lda viewport_oy,x sta oy - inx lda viewport_ox,x sta ox + 1 lda viewport_oy,x sta oy + 1 - inx - lda viewport_ox,x - sta ox + 2 - lda viewport_oy,x - sta oy + 2 - - inx - lda viewport_ox,x - sta ox + 3 - lda viewport_oy,x - sta oy + 3 - rts .endproc @@ -1758,7 +1421,6 @@ zero_byte_loop: sta DMACTL jsr clear_screen - jsr display_coords ; Copy the display list into properly aligned memory ; Can't cross 1024-byte boundaries :D @@ -1797,24 +1459,19 @@ copy_byte_loop: jsr SETVBV main_loop: - ; count_frames = 0; count_iters = 0 + ; count_frames = 0; count_pixels = 0 lda #0 sta count_frames - sta count_iters - sta count_iters + 1 + sta count_pixels - ; total_sec = 0.0; total_iters = 0.0 - jsr ZFR0 - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FST0R - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) - jsr FST0R + ; total_ms = 0.0; total_pixels = 0.0 + ldx #total_ms + jsr ZF1 + ldx #total_pixels + jsr ZF1 jsr clear_screen jsr status_bar - jsr display_coords lda #0 sta fill_level @@ -1869,11 +1526,10 @@ skipped_mask: not_skipped_mask: ; run the fractal! - zoom_factor cx, sx, aspect_x - add32 cx, cx, ox - zoom_factor cy, sy, aspect_y - neg32 cy - add32 cy, cy, oy + zoom_factor cx, sx, zoom, aspect_x + add16 cx, cx, ox + zoom_factor cy, sy, zoom, aspect_y + add16 cy, cy, oy jsr mandelbrot jsr pset @@ -1884,32 +1540,38 @@ not_skipped_mask: no_key: ; check if we should update the counters + ; + ; count_pixels >= width? update! + inc count_pixels + lda count_pixels + cmp #width + bmi update_status ; count_frames >= 120? update! lda count_frames cmp #120 ; >= 2 seconds - bpl update_status - jmp skipped + bmi skipped update_status: - ; FR0 = (float)count_iters & clear count_iters - copy16 FR0, count_iters - jsr IFP + ; FR0 = (float)count_pixels & clear count_pixels + lda count_pixels + sta FR0 lda #0 - sta count_iters - sta count_iters + 1 + sta FR0 + 1 + sta count_pixels + jsr IFP - ; FR1 = total_iters - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) + ; FR1 = total_pixels + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) jsr FLD1R ; FR0 += FR1 jsr FADD - ; total_iters = FR0 - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) + ; total_pixels = FR0 + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) jsr FST0R @@ -1922,100 +1584,44 @@ update_status: sta count_frames jsr IFP - ; FR0 *= sec_per_frame - ldx #.lobyte(sec_per_frame) - ldy #.hibyte(sec_per_frame) + ; FR0 *= ms_per_frame + ldx #.lobyte(ms_per_frame) + ldy #.hibyte(ms_per_frame) jsr FLD1R jsr FMUL - ; FR0 += total_sec - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) + ; FR0 += total_ms + ldx #total_ms + ldy #0 jsr FLD1R jsr FADD - ; total_sec = FR0 - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) + ; total_ms = FR0 + ldx #total_ms + ldy #0 jsr FST0R - ; FR0 /= total_iters - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) + ; FR0 /= total_pixels + ldx #total_pixels + ldy #0 jsr FLD1R jsr FDIV - ; FR0 *= us_per_sec - ldx #.lobyte(us_per_sec) - ldy #.hibyte(us_per_sec) - jsr FLD1R - jsr FMUL - - ; round (down) to integer - jsr FPI - clc - jsr IFP - - lda #speed_start - sta text_col - lda #0 - sta text_row - draw_string_const str_speed - - lda text_col - pha - draw_string_const str_padding - pla - sta text_col - - ; convert to ASCII in INBUFF and print + ; convert to ASCII in INBUFF jsr FASC - jsr draw_string - ; elapsed time - ; FR0 = total_sec - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FLD0R - ; FR0 -> integer -> elapsed_work - jsr FPI - lda FR0 - sta elapsed_work - lda FR0 + 1 - sta elapsed_work + 1 - - draw_string_const str_space - - .macro do_countdown divisor, digits - ldx #.lobyte(divisor) - ldy #.hibyte(divisor) - lda #.lobyte(digits) - sta INBUFF - lda #.hibyte(digits) - sta INBUFF + 1 - jsr countdown - .endmacro - do_countdown 36000, digits_space - do_countdown 3600, digits_zero - draw_string_const str_h - do_countdown 600, digits_zero - do_countdown 60, digits_zero - draw_string_const str_m - do_countdown 10, digits_zero - do_countdown 1, digits_zero - draw_string_const str_s + ; print the first 6 digits + draw_text_indirect speed_start, speed_precision, INBUFF + draw_text speed_start + speed_precision, str_speed_len, str_speed skipped: - ; sx += fill_level[fill_masks] + 1 - ldx fill_level - lda fill_masks,x clc - adc #1 ; will never carry - adc sx + lda sx + adc #1 sta sx - lda #0 - adc sx + 1 + lda sx + 1 + adc #0 sta sx + 1 lda sx @@ -2025,15 +1631,12 @@ skipped: loop_sx_done: - ; sy += fill_level[fill_masks] + 1 - ldx fill_level - lda fill_masks,x clc - adc #1 ; will never carry - adc sy + lda sy + adc #1 sta sy - lda #0 - adc sy + 1 + lda sy + 1 + adc #0 sta sy + 1 lda sy @@ -2052,130 +1655,9 @@ fill_loop_done: loop: ; finished - - lda #(40 - str_done_len) - sta text_col - lda #0 - sta text_row - draw_string_const str_done - + draw_text 40 - str_done_len, str_done_len, str_done jsr keycheck beq loop jmp main_loop .endproc - -; digit string in INBUFF -; divisor X/Y -; clobbers temp, calls draw_string -.proc countdown - divisor = temp - stx divisor - sty divisor + 1 - - ; count the hours - ldy #0 -countdown_loop: - lda elapsed_work + 1 - cmp divisor + 1 - beq countdown_lobyte - bcc countdown_done - bcs countdown_inc -countdown_lobyte: - lda elapsed_work - cmp divisor - bcc countdown_done -countdown_inc: - sec - lda elapsed_work - sbc divisor - sta elapsed_work - lda elapsed_work + 1 - sbc divisor + 1 - sta elapsed_work + 1 - iny - jmp countdown_loop -countdown_done: - lda (INBUFF),y - eor #$80 - sta elapsed_digit - lda #.lobyte(elapsed_digit) - sta INBUFF - lda #.hibyte(elapsed_digit) - sta INBUFF + 1 - jsr draw_string - rts -.endproc - -.proc imul8xe_init - - bank_switch 0 - lda #0 - sta EXTENDED_RAM - bank_switch 1 - lda #1 - sta EXTENDED_RAM - bank_switch 0 - lda EXTENDED_RAM - beq init - - ; no bank switching available, we just overwrite the value in base ram - rts - -init: - - ; patch imul16_func into a forwarding thunk to imul16xe_func - lda #$4c ; 'jmp' opcode - sta imul16_func - lda #.lobyte(imul16xe_func) - sta imul16_func + 1 - sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1 - lda #.hibyte(imul16xe_func) - sta imul16_func + 2 - sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2 - - ; ditto for sqr16_func -> sqr16xe_func - lda #$4c ; 'jmp' opcode - sta sqr16_func - lda #.lobyte(sqr16xe_func) - sta sqr16_func + 1 - sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1 - sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1 - lda #.hibyte(sqr16xe_func) - sta sqr16_func + 2 - sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2 - sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2 - - - ; create the lookup table - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - sta ptr - lda #$40 - sta ptr + 1 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -.endproc diff --git a/readme.md b/readme.md index 2c9efc1..f297d60 100644 --- a/readme.md +++ b/readme.md @@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g ## Current state -Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. +Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys. The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. @@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3 * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication -The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates. +The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. Iterations are capped at 255. @@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e ## Todo -See ideas in `todo.md`. +See ideas in `todo.md`. \ No newline at end of file diff --git a/tables.js b/tables.js index 50cbef9..176e4df 100644 --- a/tables.js +++ b/tables.js @@ -11,40 +11,19 @@ function db(func) { return lines.join('\n'); } -let squares = []; -for (let i = 0; i < 512; i++) { - squares.push(Math.trunc((i * i + 1) / 2)); -} - console.log( `.segment "TABLES" -.export mul_lobyte256 -.export mul_hibyte256 -.export mul_hibyte512 -.export sqr_lobyte -.export sqr_hibyte +.export mul_lobyte +.export mul_hibyte -; (i * i + 1) / 2 for the multiplier +; (i * i) / 2 for the multiplier .align 256 -mul_lobyte256: -${db((i) => squares[i] & 0xff)} +mul_lobyte: +${db((i) => ((i * i) >> 1) & 0xff)} .align 256 -mul_hibyte256: -${db((i) => (squares[i] >> 8) & 0xff)} - -.align 256 -mul_hibyte512: -${db((i) => (squares[i + 256] >> 8) & 0xff)} - -; (i * i) for the plain squares -.align 256 -sqr_lobyte: -${db((i) => (i * i) & 0xff)} - -.align 256 -sqr_hibyte: -${db((i) => ((i * i) >> 8) & 0xff)} +mul_hibyte: +${db((i) => ((i * i) >> 9) & 0xff)} `); diff --git a/todo.md b/todo.md index 6807ae2..1281de7 100644 --- a/todo.md +++ b/todo.md @@ -1,17 +1,15 @@ things to try: -* fix status bar to show elapsed time, per-iter time, per-pixel iter count - -* 'turbo' mode disabling graphics in full or part - * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D -* maybe clean up the load/layout of the big mul table - -* consider alternate lookup tables in the top 16KB under ROM +* try 3.13 fixed point instead of 4.12 for more precision + * can we get away without the extra bit? * y-axis mirror optimization +* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering + * maybe redo tiering to just 4x4, 2x2, 1x1? + * extract viewport for display & re-input via keyboard * fujinet screenshot/viewport uploader