diff --git a/Makefile b/Makefile index c94074b..008bf8c 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,8 @@ all : mandel.xex -mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg - ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib - -mandel.s : mandel.c mandel.h - cc65 -o $@ mandel.c +mandel.xex : mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ $+ %.o : %.s ca65 -o $@ $< @@ -16,7 +13,6 @@ tables.s : tables.js clean : rm -f tables.s - rm -f mandel.s rm -f *.o rm -f *.xex - rm -f mandel.map + diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg deleted file mode 100644 index 93b80f3..0000000 --- a/atari-asm-xex.cfg +++ /dev/null @@ -1,28 +0,0 @@ -FEATURES { - STARTADDRESS: default = $2E00; -} -SYMBOLS { - __STARTADDRESS__: type = export, value = %S; -} -MEMORY { - ZP: file = "", define = yes, start = $0082, size = $007E; - MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; - # Keep $4000-7fff clear for expanded RAM access window - TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000; - # Keep $a000-$bfff clear for BASIC cartridge -} -FILES { - %O: format = atari; -} -FORMATS { - atari: runad = start; -} -SEGMENTS { - ZEROPAGE: load = ZP, type = zp, optional = yes; - EXTZP: load = ZP, type = zp, optional = yes; # to enable modules to be able to link to C and assembler programs - CODE: load = MAIN, type = rw, define = yes; - RODATA: load = MAIN, type = ro optional = yes; - DATA: load = MAIN, type = rw optional = yes; - BSS: load = MAIN, type = bss, optional = yes, define = yes; - TABLES: load = TABLES, type = ro, optional = yes, align = 256; -} diff --git a/atari-xex.cfg b/atari-xex.cfg deleted file mode 100644 index 467d9d4..0000000 --- a/atari-xex.cfg +++ /dev/null @@ -1,69 +0,0 @@ -# Sample linker configuration for C programs using the Atari binary file support. 
-# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex -FEATURES { - STARTADDRESS: default = $8000; -} -SYMBOLS { - __SYSTEM_CHECK__: type = import; # force inclusion of "system check" load chunk - __STACKSIZE__: type = weak, value = $0800; # 2k stack - __STARTADDRESS__: type = export, value = %S; - __RESERVED_MEMORY__: type = weak, value = $0000; - __SYSCHKHDR__: type = export, value = 0; # Disable system check header - __SYSCHKTRL__: type = export, value = 0; # Disable system check trailer - __TABLESEG_START__: type = weak, value = $2E00 + $0300; - __TABLESEG_SIZE__: type = weak, value = 6 * $100; - __BANKSY_START__: type = weak, value = $4000; - __BANKSY_SIZE__: type = weak, value = $4000; - __FRAMEBUFFER_START__: type = weak, value = $A000; -} -MEMORY { -# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP. - ZP: file = "", define = yes, start = $0082, size = $007E; -# "system check" load chunk - SYSCHKCHNK: file = %O, start = $2E00, size = $0300; -# Note $a000-$bfff is against the BASIC cartridge, may require booting with OPTION. - TABLES: file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__; -# We reserve $4000-7fff for the bank-switch window. -# In theory we could keep data and code here that we only use on 48k/64k systems. 
- BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__; -# "main program" load chunk - MAIN: file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S; -} -FILES { - %O: format = atari; -} -FORMATS { - atari: runad = start, - initad = SYSCHKCHNK: __SYSTEM_CHECK__; -} -SEGMENTS { - ZEROPAGE: load = ZP, type = zp; - EXTZP: load = ZP, type = zp, optional = yes; - SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes; - TABLES: load = TABLES, type = ro, optional = yes, align = 256; - BANKSWICH: load = BANKSWITCH, type = ro, optional = yes; - STARTUP: load = MAIN, type = ro, define = yes; - LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized - LOWCODE: load = MAIN, type = ro, define = yes, optional = yes; - ONCE: load = MAIN, type = ro, optional = yes; - CODE: load = MAIN, type = ro, define = yes; - RODATA: load = MAIN, type = ro; - DATA: load = MAIN, type = rw; - INIT: load = MAIN, type = rw, optional = yes; - BSS: load = MAIN, type = bss, define = yes; -} -FEATURES { - CONDES: type = constructor, - label = __CONSTRUCTOR_TABLE__, - count = __CONSTRUCTOR_COUNT__, - segment = ONCE; - CONDES: type = destructor, - label = __DESTRUCTOR_TABLE__, - count = __DESTRUCTOR_COUNT__, - segment = RODATA; - CONDES: type = interruptor, - label = __INTERRUPTOR_TABLE__, - count = __INTERRUPTOR_COUNT__, - segment = RODATA, - import = __CALLIRQ__; -} diff --git a/mandel-core.s b/mandel-core.s deleted file mode 100644 index 6ebb089..0000000 --- a/mandel-core.s +++ /dev/null @@ -1,2181 +0,0 @@ -; Our zero-page vars -ox = $80 ; fixed6.26: center point x -oy = $84 ; fixed6.26: center point y -cx = $88 ; fixed6.26: c_x -cy = $8c ; fixed6.26: c_y - -zx = $90 ; fixed6.26: z_x -zy = $94 ; fixed6.26: z_y -zx_2 = $98 ; fixed6.26: z_x^2 -zy_2 = $9c ; fixed6.26: z_y^2 - -zx_zy = $a0 ; fixed6.26: z_x * z_y -dist = $a4 ; fixed6.26: z_x^2 + z_y^2 -sx = $a8 ; i16: screen pixel x -sy = $aa ; 
i16: screen pixel y -z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not -z_buffer_start = $ad ; u8: index into z_buffer -z_buffer_end = $ae ; u8: index into z_buffer -iter = $af ; u8: iteration count - -ptr = $b0 ; u16 -pixel_ptr = $b2 ; u16 -zoom = $b4 ; u8: zoom shift level -fill_level = $b5 ; u8 -pixel_color = $b6 ; u8 -pixel_mask = $b7 ; u8 -pixel_shift = $b8 ; u8 -pixel_offset = $b9 ; u8 -palette_offset = $ba ; u8 -chroma_offset = $bb ; u8 -palette_ticks = $bc ; u8 -chroma_ticks = $bd ; u8 -count_frames = $be ; u8 -; free space $bf - -count_iters = $c0 ; u16 -text_col = $c2 ; u8 -text_row = $c3 ; u8 -; free space c4-cb -temp = $cc ; u16 -temp2 = $ce ; u16 - -palette_delay = 23 -chroma_delay = 137 - - -; FP registers in zero page -FR0 = $d4 ; float48 -FRE = $da -FR1 = $e0 ; float48 -FR2 = $e6 ; float48 -CIX = $f2 ; u8 - index into INBUFF -INBUFF = $f3 ; u16 - pointer to ascii -FLPTR = $fc ; u16 - pointer to user buffer float48 - -CH1 = $02f2 ; previous character read from keyboard -CH = $02fc ; current character read from keyboard - -LBUFF = $0580 ; result buffer for FASC routine - -; FP ROM routine vectors -FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) -IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) -FPI = $D9D2 ; floating point to integer -FADD = $DA66 ; ADDITION (FR0 += FR1) -FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) -FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) -FDIV = $DB28 ; DIVISION (FR0 /= FR1) -ZFR0 = $DA44 ; clear FR0 -ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) -FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) -FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) -FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) -FMOVE = $DDB6 ; MOVE FR0 TO FR1 - -; High data -framebuffer_top = $a000 -textbuffer = $af00 -framebuffer_bottom = $b000 -display_list = $bf00 -framebuffer_end = $c000 - -height = 176 -half_height = height >> 1 -width = 160 
-half_width = width >> 1 -stride = width >> 2 - -EXTENDED_RAM = $4000 ; 16KiB bank on the XE -PORTB = $D301 ; memory & bank-switch for XL/XE - -DMACTL = $D400 -DLISTL = $D402 -DLISTH = $D403 -WSYNC = $D40A - -; OS shadow registers -SDLSTL = $230 -SDLSTH = $231 - -; interrupt stuff -SYSVBV = $E45F -XITVBV = $E462 -SETVBV = $E45C - -COLOR0 = $2C4 -COLOR1 = $2C5 -COLOR2 = $2C6 -COLOR3 = $2C7 -COLOR4 = $2C8 - -; Keycodes! -KEY_PLUS = $06 -KEY_MINUS = $0e -KEY_UP = $8e -KEY_DOWN = $8f -KEY_LEFT = $86 -KEY_RIGHT = $87 -KEY_1 = $1f -KEY_2 = $1e -KEY_3 = $1a -KEY_4 = 24 -KEY_5 = 29 -KEY_6 = 27 -KEY_7 = 51 -KEY_8 = 53 -KEY_9 = 48 -KEY_0 = 50 -KEY_PERIOD = 34 -KEY_E = 42 -KEY_X = 22 -KEY_Y = 43 - -.struct float48 - exponent .byte - mantissa .byte 5 -.endstruct - -.import mul_lobyte256 -.import mul_hibyte256 -.import mul_hibyte512 -.import sqr_lobyte -.import sqr_hibyte - -.data - -strings: -str_self: - .byte "MANDEL-6502" -str_self_end: - .byte 0 -str_speed: - .byte "us/iter: " -str_speed_end: - .byte 0 -str_run: - .byte " RUN" -str_run_end: - .byte 0 -str_done: - .byte "DONE" -str_done_end: - .byte 0 -str_padding: - .byte " " -str_padding_end: - .byte 0 - -str_space: - .byte " " - .byte 0 - -str_h: - .byte "h" - .byte 0 -str_m: - .byte "m" - .byte 0 -str_s: - .byte "s" - .byte 0 - -str_speed_len = str_speed_end - str_speed -str_run_len = str_run_end - str_run -str_done_len = str_done_end - str_done -str_padding_len = str_padding_end - str_padding - -; "3h59m59s" -str_elapsed_spacer = 8 -speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1 - -col_x = 1 -str_x: - .byte "X:" - .byte 0 -str_x_len = 2 -str_x_space = 12 -str_x_padding = 2 - -col_y = col_x + str_x_len + str_x_space + str_x_padding -str_y: - .byte "Y:" - .byte 0 -str_y_len = 2 -str_y_space = 12 -str_y_padding = 2 - -col_zoom = col_y + str_y_len + str_y_space + str_y_padding -str_zoom: - .byte "ZOOM:" - .byte 0 -str_zoom_len = 5 - -char_map: - ; Map ATASCII string values to 
framebuffer font entries - ; Sighhhhh - .repeat 32, i - .byte i + 64 - .endrepeat - .repeat 64, i - .byte i - .endrepeat - .repeat 32, i - .byte 96 + i - .endrepeat - -hex_chars: -digits_zero: - .byte "0123456789abcdef" - -digits_space: - .byte " 123456789abcdef" - -aspect: - ; aspect ratio! - ; pixels at 320w are 5:6 (narrow) - ; pixels at 160w are 5:3 (wide) - ; - ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4) - ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4) - ; - ; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624 - ; &horizontal range -80 .. 79.9 is -3.125 .. 3.124 - ; - ; 184h is the equiv of 220.8h at square pixels - ; 320 / 220.8 = 1.45 display aspect ratio -aspect_x: ; fixed3.13 5/4 - .word 5 << (13 - 2) - -aspect_y: ; fixed3.13 3/4 - .word 3 << (13 - 2) - -fixed3_13_as_float: ; float48 - ; 1 << 13 - ; 8192 - ; 81 92 . 00 00 00 - .byte 65 ; exponent/sign - +1 byte - .byte $81 - .byte $92 - .byte $00 - .byte $00 - .byte $00 - -sec_per_frame: ; float48 00 . 01 66 66 66 67 - .byte 63 ; exponent/sign - -1 bytes - .byte $01 ; BCD digits - .byte $66 - .byte $66 - .byte $66 - .byte $67 - -us_per_sec: ; float48 1e9 01 00 0,0 00 . 
00 - .byte 67 ; exponent/sign +3 bytes - .byte $01 ; BCD digits - .byte $00 - .byte $00 - .byte $00 - .byte $00 - -total_iters: ; float48 - .repeat 6 - .byte 0 - .endrepeat - -total_sec: ; float48 - .repeat 6 - .byte 0 - .endrepeat - -display_list_start: - ; 24 lines overscan - .repeat 3 - .byte $70 ; 8 blank lines - .endrep - - ; 8 scan lines, 1 row of 40-column text - .byte $42 - .addr textbuffer - - ; 184 lines graphics - ; ANTIC mode e (160px 2bpp, 1 scan line per line) - .byte $4e - .addr framebuffer_top - .repeat half_height - 1 - .byte $0e - .endrep - .byte $4e - .addr framebuffer_bottom - .repeat half_height - 1 - .byte $0e - .endrep - - ; 8 scan lines, 1 row of 40-column text - .byte $42 - .addr textbuffer + 40 - - .byte $41 ; jump and blank - .addr display_list -display_list_end: -display_list_len = display_list_end - display_list_start - -color_map: - .byte 0 - .repeat 85 - .byte %01010101 - .byte %10101010 - .byte %11111111 - .endrepeat - - -palette_start: - .byte $0e - .byte $08 - .byte $04 -palette_repeat: - .byte $0e - .byte $08 - -palette_entries = 3 - -palette_chroma: - .repeat 15, i - .byte (i + 1) << 4 - .endrepeat - .repeat 2, i - .byte (i + 1) << 4 - .endrepeat -palette_chroma_entries = 15 - -.code - -;z_buffer_len = 16 ; 10.863 ms/px -;z_buffer_len = 12 ; 10.619 ms/px -z_buffer_len = 8 ; 10.612 ms/px -;z_buffer_len = 4 ; 12.395 ms/px -z_buffer_mask = z_buffer_len - 1 -z_buffer: - ; the last N zx/zy values - .repeat z_buffer_len - .word 0 - .word 0 - .endrepeat - -.export _mandel_start - -;max_fill_level = 6 -max_fill_level = 3 -fill_masks: -; .byte %00011111 -; .byte %00001111 -; .byte %00000111 - .byte %00000011 - .byte %00000001 - .byte %00000000 - -pixel_masks: - .byte %11111111 - .byte %11110000 - .byte %11000000 - -viewport_zoom: - .byte 0 - .byte 5 - .byte 7 - .byte 5 - .byte 7 - .byte 7 - -viewport_ox: - .dword ($00000000 & $3fffffff) << 2 - .dword ($ff110000 & $3fffffff) << 2 - .dword ($ff110000 & $3fffffff) << 2 - .dword ($fe400000 & 
$3fffffff) << 2 - .dword ($fe3b0000 & $3fffffff) << 2 - .dword $fd220000 - -viewport_oy: - .dword ($00000000 & $3fffffff) << 2 - .dword ($ffb60000 & $3fffffff) << 2 - .dword ($ffbe0000 & $3fffffff) << 2 - .dword ($00000000 & $3fffffff) << 2 - .dword ($fffe0000 & $3fffffff) << 2 - .dword $ff000000 - -elapsed_work: - .dword 0 -elapsed_digit: - .byte 0 - -input_col: - .byte 0 -input_row: - .byte 0 -input_max: - .byte 0 - -; 2 + 9 * byte cycles -.macro add bytes, dest, arg1, arg2 - clc ; 2 cyc - .repeat bytes, byte ; 9 * byte cycles - lda arg1 + byte - adc arg2 + byte - sta dest + byte - .endrepeat -.endmacro - -; 20 cycles -.macro add16 dest, arg1, arg2 - add 2, dest, arg1, arg2 -.endmacro - -; 38 cycles -.macro add32 dest, arg1, arg2 - add 4, dest, arg1, arg2 -.endmacro - -; 8 cycles -.macro add_carry dest - lda dest ; 3 cyc - adc #0 ; 2 cyc - sta dest ; 3 cyc -.endmacro - -; 2 + 9 * byte cycles -.macro sub bytes, dest, arg1, arg2 - sec ; 2 cyc - .repeat bytes, byte ; 9 * byte cycles - lda arg1 + byte - sbc arg2 + byte - sta dest + byte - .endrepeat -.endmacro - -; 20 cycles -.macro sub16 dest, arg1, arg2 - sub 2, dest, arg1, arg2 -.endmacro - -; 38 cycles -.macro sub32 dest, arg1, arg2 - sub 4, dest, arg1, arg2 -.endmacro - -; 3 + 5 * (bytes - 1) cycles -.macro shl bytes, arg - asl arg ; 3 cyc - .repeat bytes-1, i - rol arg + 1 + i ; 5 cyc - .endrepeat -.endmacro - -; 8 cycles -.macro shl16 arg - shl 2, arg -.endmacro - -; 13 cycles -.macro shl24 arg - shl 3, arg -.endmacro - -; 18 cycles -.macro shl32 arg - shl 4, arg -.endmacro - -; 6 * bytes cycles -; 4 * bytes bytes -.macro copy bytes, dest, arg - .repeat bytes, byte ; 6 * bytes cycles - lda arg + byte ; 3 cyc - sta dest + byte ; 3 cyc - .endrepeat -.endmacro - -; 12 cycles -; 8 bytes -.macro copy16 dest, arg - copy 2, dest, arg -.endmacro - -; 24 cycles -.macro copy32 dest, arg - copy 4, dest, arg -.endmacro - -; 36 cycles -.macro copyfloat dest, arg - copy 6, dest, arg -.endmacro - -; 2 + 8 * byte cycles 
-.macro neg bytes, arg - sec ; 2 cyc - .repeat bytes, byte ; 8 * byte cycles - lda #00 ; 2 cyc - sbc arg + byte ; 3 cyc - sta arg + byte ; 3 cyc - .endrepeat -.endmacro - -; 18 cycles -.macro neg16 arg - neg 2, arg -.endmacro - -; 34 cycles -.macro neg32 arg - neg 4, arg -.endmacro - -; 11-27 + 18 * shift cycles -; 65-81 cycles for shift=3 -.macro shift_round_16 arg, shift - .repeat shift - shl32 arg ; 18 cycles - .endrepeat - round16 arg ; 11-27 cycles -.endmacro - -; input: arg1, arg2 as fixed4.12 -; output: dest as fixed8.24 -; patch point jsr at 16 bytes in -imul16_patch_offset = 16 -.macro imul16 dest, arg1, arg2 - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; ? cyc - copy32 dest, FR2 ; 24 cyc -.endmacro - -; input: arg as fixed4.12 -; output: dest as fixed8.24 -; patch point jsr at 8 bytes in -sqr16_patch_offset = 8 -.macro sqr16 dest, arg - copy16 FR0, arg ; 12 cyc - jsr sqr16_func ; ? cyc - copy32 dest, FR2 ; 24 cyc -.endmacro - -; input: arg as u8 -; output: dest as u16 -; clobbers a, x -.macro sqr8 dest, arg - ldx arg - lda sqr_lobyte,x - sta dest - lda sqr_hibyte,x - sta dest + 1 -.endmacro - -.segment "TABLES" -; lookup table for top byte -> PORTB value for bank-switch -.align 256 -bank_switch_table: - .repeat 256, i - .byte ((i & $c0) >> 4) | $e3 - .endrepeat - -.code - -.macro bank_switch bank - lda #((bank << 2) | $e3) - sta PORTB -.endmacro - -.macro imul8 dest, arg1, arg2, xe - .if xe - ; using 64KB lookup table - ; 51-70 cycles - ; clobbers x, y, dest, ptr - .scope - output = dest - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - txa ; 2 cyc - and #$3f ; 2 cyc - ora #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; copy the entry into output - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - tay ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y 
; 5 cyc - sta output+1 ; 3 cyc - - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled - - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc - - ; add arg2 one last time for the skipped bit - clc ; 2 cyc - txa ; 2 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc - - done: - .endscope - .else - ; Using base 48k RAM compatibility mode - ; Small table of half squares - ; Adapted from https://everything2.com/title/Fast+6502+multiplication - ; 81-92 cycles - .scope - mul_factor_a = arg1 - mul_factor_x = arg2 - mul_product_lo = dest - mul_product_hi = dest + 1 - - lda mul_factor_a ; 3 cyc - - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc - next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc - - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc - - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 - small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - .endscope - .endif -.endmacro - - -; Initialize a 16 KB chunk of the table -; input: multipliers in temp -; output: new multipliers in temp -; clobbers: temp, temp2 
-.proc imul8xe_init_section - arg1 = FR1 - arg2 = FR2 - result = FR0 - ptr = temp2 - - lda #$00 - sta ptr - lda #$40 - sta ptr + 1 - - ldy #0 - - ; outer loop: $00 -> $3f -outer_loop: - - ; reset result to 0 - lda #0 - sta result - sta result + 1 - - ; inner loop: $00 -> $ff -inner_loop: - - ; copy result to data set - lda result - sta (ptr),y - lda result + 1 - iny - sta (ptr),y - dey - - ; result += 2 * arg2 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - - ; inner loop check - inc arg1 - inc arg1 - inc ptr - inc ptr - bne inner_loop - - ; outer loop check - inc arg2 - inc ptr + 1 - lda ptr + 1 - cmp #$80 - bne outer_loop - - rts - -.endproc - -.macro imul16_impl xe - .local arg1 - .local arg2 - .local result - .local inter - .local arg1_pos - .local arg2_pos - arg1 = FR0 ; 16-bit arg (clobbered) - arg2 = FR1 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 - - ; h1l1 * h2l2 - ; (h1*256 + l1) * (h2*256 + l2) - ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) - ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - - imul8 result, arg1, arg2, xe - - imul8 result + 2, arg1 + 1, arg2 + 1, xe - - imul8 inter, arg1 + 1, arg2, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8 inter, arg1, arg2 + 1, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg1 + 1 - bpl arg1_pos - sub16 result + 2, result + 2, arg2 -arg1_pos: - lda arg2 + 1 - bpl arg2_pos - sub16 result + 2, result + 2, arg1 -arg2_pos: - - rts ; 6 cyc -.endmacro - -.macro sqr16_impl xe - .scope - arg = FR0 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - ;inter = temp2 - inter = FR1 - - lda arg + 1 - bpl arg_pos - neg16 arg - arg_pos: - - ; hl * hl - ; (h*256 + l) * (h*256 + l) - ; h*256*(h*256 + l) + l*(h*256 + l) - ; h*h*256*256 + 
h*l*256 + h*l*256 + l*l - - sqr8 result, arg - - sqr8 result + 2, arg + 1 - - imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - add16 result + 1, result + 1, inter - add_carry result + 3 - - rts ; 6 cyc - .endscope -.endmacro - -.proc imul16_func - imul16_impl 0 -.endproc - -.proc imul16xe_func - imul16_impl 1 -.endproc - -.proc sqr16_func - sqr16_impl 0 -.endproc - -.proc sqr16xe_func - sqr16_impl 1 -.endproc - -; 11-27 cycles -.macro round16 arg - ; Round top 16 bits of 32-bit fixed-point number in-place - .local increment - .local high_half - .local check_sign - .local next - - ; low word > $8000: round up - ; = $8000: round up if positive - ; round down if negative - ; < $8000: round down - - ; $8000 17 - ; $8001 27 - ; $8100 21 - ; $7fff 11 - - lda arg + 1 ; 3 cyc - cmp #$80 ; 2 cyc - beq high_half ; 2 cyc - - bpl increment ; 2 cyc - - bmi next ; 2 cyc - -high_half: - lda arg ; 3 cyc - beq check_sign ; 2 cyc - - jmp increment ; 3 cyc - -check_sign: - lda arg + 3 ; 3 cyc - bmi next ; 2 cyc - -increment: ; 5-10 cyc - inc arg + 2 ; 5 cyc - bne next ; 2 cyc - inc arg + 3 ; 5 cyc - -next: - -.endmacro - -; input in FR0, 16 bits signed 3.13 fixed -; output in FR0, Atari float -; clobbers a, x, y, FR0, FR1 -.proc fixed3_13_to_float - ldx #.lobyte(fixed3_13_as_float) - ldy #.hibyte(fixed3_13_as_float) - jsr FLD1R - - ; check sign bit! conversion routine is for unsigned - lda FR0 + 1 - bpl positive - -negative: - neg16 FR0 - jsr IFP - - ; set float sign bit - lda FR0 - ora #$80 - sta FR0 - jmp common - -positive: - jsr IFP - -common: - jsr FDIV - rts - -.endproc - -; rounds to 16-bit first! 
-; input in FR0, 32 bits signed 6.26 fixed
-; output in FR0, Atari float
-; clobbers a, x, y, FR0, FR1
-.proc fixed6_26_to_float
-    shift_round_16 FR0, 3
-    copy16 FR0, FR0 + 2
-    jsr fixed3_13_to_float
-    rts
-.endproc
-
-; Convert an Atari float to signed 3.13 fixed point by scaling it with
-; the fixed3_13_as_float constant (8192) and converting to an integer.
-; input in FR0, Atari float
-; output in FR0, 16 bits signed 3.13 fixed
-; clobbers a, x, y, FR0, FR1
-.proc float_to_fixed3_13
-    ; FR1 = 8192.0, then FR0 *= FR1
-    ldx #.lobyte(fixed3_13_as_float)
-    ldy #.hibyte(fixed3_13_as_float)
-    jsr FLD1R
-    jsr FMUL
-
-    ; check sign bit! conversion routine is for unsigned
-    ; The sign is bit 7 of the exponent byte (see fixed3_13_to_float, which
-    ; sets it with ora #$80), so branch on the N flag from the load. The
-    ; carry flag here is whatever FMUL left behind and is not the sign.
-    lda FR0
-    bpl positive
-
-negative:
-    ; clear float sign bit so FPI sees the magnitude
-    lda FR0
-    eor #$80
-    sta FR0
-
-    ; convert the magnitude, then negate the integer result
-    jsr FPI
-    neg16 FR0
-    jmp common
-
-positive:
-    jsr FPI
-
-common:
-    rts
-
-.endproc
-
-.proc mandelbrot
-    ; input:
-    ; cx: position scaled to 6.26 fixed point - -32..+31.9
-    ; cy: position scaled to 6.26
-    ;
-    ; output:
-    ; iter: iteration count at escape or 0
-
-    ; zx = 0
-    ; zy = 0
-    ; zx_2 = 0
-    ; zy_2 = 0
-    ; zx_zy = 0
-    ; dist = 0
-    ; iter = 0
-; lda #00
-; ldx #(iter - zx + 1)
-;initloop:
-; sta zx - 1,x
-; dex
-; bne initloop
-; sta z_buffer_start
-; sta z_buffer_end
-
-    lda #00
-    sta zx
-    sta zx + 1
-    sta zx + 2
-    sta zx + 3
-    sta zy
-    sta zy + 1
-    sta zy + 2
-    sta zy + 3
-    sta zx_2
-    sta zx_2 + 1
-    sta zx_2 + 2
-    sta zx_2 + 3
-    sta zy_2
-    sta zy_2 + 1
-    sta zy_2 + 2
-    sta zy_2 + 3
-    sta zx_zy
-    sta zx_zy + 1
-    sta zx_zy + 2
-    sta zx_zy + 3
-    sta dist
-    sta dist + 1
-    sta dist + 2
-    sta dist + 3
-    sta iter
-    sta z_buffer_start
-    sta z_buffer_end
-
-loop:
-    inc count_iters
-    bne low_iters
-    inc count_iters + 1
-low_iters:
-
-    ; iter++ & max-iters break
-    inc iter
-    bne keep_going
-    jmp exit_path
-keep_going:
-
-    .macro quick_exit arg, max
-    ; arg: fixed6.26
-    ; max: integer
-    .local positive
-    .local negative
-    .local nope_out
-    .local first_equal
-    .local all_done
-
-    ; check sign bit
-    lda arg + 3
-    bmi negative
-
-    positive:
-    cmp #(max << 2)
-    bmi all_done ; 'less than'
-    jmp exit_path
-
-    negative:
-    cmp #(256 - (max << 2))
-    beq first_equal ; 'equal' on first byte
-    
bpl all_done ; 'greater than' - - nope_out: - jmp exit_path - - first_equal: - ; following bytes all 0 shows it's really 'equal' - lda arg + 2 - bne all_done - lda arg + 1 - bne all_done - lda arg - bne all_done - jmp exit_path - - all_done: - .endmacro - - ; 6.26: (-32 .. 31.9) - ; zx = zx_2 - zy_2 + cx - sub32 zx, zx_2, zy_2 - add32 zx, zx, cx - quick_exit zx, 2 - - ; zy = zx_zy + zx_zy + cy - add32 zy, zx_zy, zx_zy - add32 zy, zy, cy - quick_exit zy, 2 - - ; convert 6.26 -> 3.13: (-4 .. +3.9) - shift_round_16 zx, 3 - shift_round_16 zy, 3 - - ; zx_2 = zx * zx -fixup_sqr16_1: - sqr16 zx_2, zx + 2 - - ; zy_2 = zy * zy -fixup_sqr16_2: - sqr16 zy_2, zy + 2 - - ; zx_zy = zx * zy -fixup_imul16_1: - imul16 zx_zy, zx + 2, zy + 2 - - ; dist = zx_2 + zy_2 - add32 dist, zx_2, zy_2 - quick_exit dist, 4 - - ; if may be in the lake, look for looping output with a small buffer - ; as an optimization vs running to max iters - lda z_buffer_active - beq skip_z_buffer - - ldx z_buffer_start - cpx z_buffer_end - beq z_nothing_to_read - -z_buffer_loop: - .macro z_compare arg - .local compare_no_match - lda z_buffer,x - inx - cmp arg - bne compare_no_match - iny - compare_no_match: - .endmacro - .macro z_advance - .local skip_reset_x - cpx #(z_buffer_len * 4) - bmi skip_reset_x - ldx #0 - skip_reset_x: - .endmacro - .macro z_store arg - lda arg - sta z_buffer,x - inx - .endmacro - - ; Compare the previously stored z values - ldy #0 - z_compare zx + 2 - z_compare zx + 3 - z_compare zy + 2 - z_compare zy + 3 - - cpy #4 - bne z_no_matches - jmp z_exit - -z_no_matches: - z_advance - - cpx z_buffer_end - bne z_buffer_loop - -z_nothing_to_read: - - ; Store and expand - z_store zx + 2 - z_store zx + 3 - z_store zy + 2 - z_store zy + 3 - z_advance - stx z_buffer_end - - ; Increment the start roller if necessary (limit size) - lda iter - cmp #(z_buffer_len * 4) - bmi skip_inc_start - lda z_buffer_start - clc - adc #4 - tax - z_advance - stx z_buffer_start -skip_inc_start: - -skip_z_buffer: - - 
jmp loop - -z_exit: - lda #0 - sta iter - -exit_path: - ldx #0 - lda iter - bne next - inx -next: - stx z_buffer_active - rts - -.endproc - -.macro scale_zoom dest - ; clobbers X, flags - .local cont - .local enough - - ; cx = (sx << (8 - zoom)) - ldx zoom -cont: - cpx #8 - beq enough - shl16 dest - inx - jmp cont -enough: -.endmacro - -.macro zoom_factor dest, src, aspect - ; output: dest: fixed6.26 - ; input: src: fixed3.13 - ; aspect: fixed3.13 - ; clobbers A, X, flags, etc - copy16 dest, src - scale_zoom dest - - ; cy = cy * (3 / 4) - ; cx = cx * (5 / 4) - imul16 dest, dest, aspect -.endmacro - -.proc pset - ; screen coords in signed sx,sy - ; iter holds the target to use - ; @todo implement - - ; iter -> color - ldx iter - lda color_map,x - ldx fill_level - and pixel_masks,x - sta pixel_color - lda pixel_masks,x - eor #$ff - sta pixel_mask - - ; sy -> line base address in temp - lda sy - bpl positive - -negative: - ; temp1 = top half - lda #.lobyte(framebuffer_top + stride * half_height) - sta pixel_ptr - lda #.hibyte(framebuffer_top + stride * half_height) - sta pixel_ptr + 1 - jmp point - -positive: - - lda #.lobyte(framebuffer_bottom) - sta pixel_ptr - lda #.hibyte(framebuffer_bottom) - sta pixel_ptr + 1 - -point: - - ; pixel_ptr += sy * stride - ; temp * 40 - ; = temp * 32 + temp * 8 - ; = (temp << 5) + (temp << 3) - copy16 temp, sy - shl16 temp - shl16 temp - shl16 temp - add16 pixel_ptr, pixel_ptr, temp - shl16 temp - shl16 temp - add16 pixel_ptr, pixel_ptr, temp - - ; Ok so temp1 points to the start of the line, which is 40 bytes. 
- ; Get the byte and bit offsets - lda sx - clc - adc #half_width - sta temp - - ; pixel_shift = temp & 3 - ; pixel_color <<= pixel_shift (shifting in zeros) - ; pixel_mask <<= pixel_shift (shifting in ones) - and #3 - sta pixel_shift - tax -shift_loop: - beq shift_done - lsr pixel_color - lsr pixel_color - sec - ror pixel_mask - sec - ror pixel_mask - dex - jmp shift_loop -shift_done: - - ldy fill_level - ldx fill_masks,y - inx - - ; pixel_offset = temp >> 2 - lda temp - lsr a - lsr a - sta pixel_offset - tay - -draw_pixel: - ; read, mask, or, write - lda (pixel_ptr),y - and pixel_mask - ora pixel_color - sta (pixel_ptr),y - - dex - beq done - clc - lda #40 - adc pixel_ptr - sta pixel_ptr - lda #0 - adc pixel_ptr + 1 - sta pixel_ptr + 1 - jmp draw_pixel - -done: - rts -.endproc - -; in/out: column in text_col -; in: row in text_row -; in: pointer to string in INBUFF -; clobbers x/y/a/temp -.proc draw_string - drawptr = temp - strptr = INBUFF - - clc - lda #.lobyte(textbuffer) - adc text_col - sta temp - lda #.hibyte(textbuffer) - adc #0 - sta temp + 1 - - ldx text_row - beq done_rows -continue_rows: - clc - lda temp - adc #40 - sta temp - lda temp + 1 - adc #0 - sta temp + 1 - dex - bne continue_rows - -done_rows: - - ldy #0 -loop: - lda (strptr),y - ; if char's null, terminate c-style - beq done - ; save the char for terminator check - pha - ; strip the high bit (terminator) - and #$7f - tax - lda char_map,x - sta (drawptr),y - iny - - pla - ; _last_ char has high bit set in atari rom routines - bmi done - jmp loop - -done: - ; move the text column pointer - tya - clc - adc text_col - sta text_col - - rts -.endproc - -.macro draw_string_const str - lda #.lobyte(str) - sta INBUFF - lda #.hibyte(str) - sta INBUFF + 1 - jsr draw_string -.endmacro - -.proc vblank_handler - inc count_frames - - inc chroma_ticks - lda chroma_ticks - cmp #(chroma_delay) - bne skip_chroma - - lda #0 - sta chroma_ticks - - inc chroma_offset - lda chroma_offset - cmp 
#(palette_chroma_entries) - bne skip_chroma - - lda #0 - sta chroma_offset -skip_chroma: - - inc palette_ticks - lda palette_ticks - cmp #(palette_delay) - bne skip_luma - - lda #0 - sta palette_ticks - - inc palette_offset - lda palette_offset - cmp #(palette_entries) - bne skip_luma - - lda #0 - sta palette_offset - -skip_luma: - jsr update_palette - jmp XITVBV -.endproc - -.proc update_palette - lda #0 - sta COLOR4 - - ldx chroma_offset - ldy palette_offset - lda palette_chroma,x - ora palette_start,y - sta COLOR2 - - ;inx - iny - lda palette_chroma,x - ora palette_start,y - sta COLOR1 - - ;inx - iny - lda palette_chroma,x - ora palette_start,y - sta COLOR0 - - rts -.endproc - -.proc update_speed - ; convert frames (u16) to fp - ; add to frames_total - ; convert pixels (u16) to fp - ; add to pixels_total - ; (frames_total * 16.66666667) / pixels_total - ; convert to ATASCII - ; draw text -.endproc - -.proc keycheck - ; clobbers all - ; returns 255 in A if state change or 0 if no change - - ; check keyboard buffer - lda CH - cmp #$ff - beq skip_char - - ; Clear the keyboard buffer and re-enable interrupts - ldx #$ff - stx CH - - tay - - lda zoom - cpy #KEY_PLUS - beq plus - cpy #KEY_MINUS - beq minus - - ; temp+temp2 = $00010000 << (8 - zoom) - lda #$00 - sta temp - sta temp + 1 - lda #$01 - sta temp + 2 - lda #$00 - sta temp + 3 - scale_zoom temp + 2 - - cpy #KEY_UP - beq up - cpy #KEY_DOWN - beq down - cpy #KEY_LEFT - beq left - cpy #KEY_RIGHT - beq right - jmp number_keys - -skip_char: - lda #0 - rts - -plus: - lda zoom - cmp #7 - bpl skip_char - inc zoom - jmp done -minus: - lda zoom - cmp #1 - bmi skip_char - dec zoom - jmp done -up: - add32 oy, oy, temp - jsr display_coords - jmp done -down: - sub32 oy, oy, temp - jsr display_coords - jmp done -left: - sub32 ox, ox, temp - jsr display_coords - jmp done -right: - add32 ox, ox, temp - jsr display_coords - jmp done - -number_keys: - cpy #KEY_1 - beq one - cpy #KEY_2 - beq two - cpy #KEY_3 - beq three - cpy 
#KEY_4 - beq four - cpy #KEY_5 - beq five - cpy #KEY_6 - beq six - jmp letter_keys - -one: - ldx #0 - jmp load_key_viewport -two: - ldx #1 - jmp load_key_viewport -three: - ldx #2 - jmp load_key_viewport -four: - ldx #3 - jmp load_key_viewport -five: - ldx #4 - jmp load_key_viewport -six: - ldx #5 - jmp load_key_viewport - -letter_keys: - cpy #KEY_X - bne not_x - jsr input_x - jmp done -not_x: - cpy #KEY_Y - bne not_y - jsr input_y - jmp done -not_y: - jmp skip_char - -load_key_viewport: - jsr load_viewport - ; fall through -done: - lda #255 - rts - -.endproc - -.proc input_x - ldx #col_x - ldy #1 - jsr input_number - - - rts -.endproc - -.proc input_y - rts -.endproc - -.proc input_number - rts -.endproc - -.proc clear_screen - ; zero the range from framebuffer_top to display_list - lda #.lobyte(framebuffer_top) - sta temp - lda #.hibyte(framebuffer_top) - sta temp + 1 - -zero_page_loop: - lda #0 - ldy #0 -zero_byte_loop: - sta (temp),y - iny - bne zero_byte_loop - - inc temp + 1 - lda temp + 1 - cmp #.hibyte(display_list) - bne zero_page_loop - - rts -.endproc - -.proc status_bar - ; Status bar - - lda #0 - sta text_col - lda #0 - sta text_row - draw_string_const str_self - - lda #(40 - str_run_len) - sta text_col - draw_string_const str_run - - rts -.endproc - -.proc display_coords - lda #1 - sta text_row - lda #col_x - sta text_col - draw_string_const str_x - - copy32 FR0, ox - jsr fixed6_26_to_float - jsr FASC - jsr draw_string - - lda #col_y - sta text_col - draw_string_const str_y - - copy32 FR0, oy - jsr fixed6_26_to_float - jsr FASC - jsr draw_string - - lda #col_zoom - sta text_col - draw_string_const str_zoom - - lda zoom - clc - adc #0 - sta FR0 - lda #0 - sta FR0 + 1 - jsr IFP - jsr FASC - jsr draw_string - - rts - -.endproc - -; input: viewport selector in x -; clobbers: a, x -.proc load_viewport - - lda viewport_zoom,x - sta zoom - - txa - asl a - asl a - - tax - lda viewport_ox,x - sta ox - lda viewport_oy,x - sta oy - - inx - lda viewport_ox,x - 
sta ox + 1 - lda viewport_oy,x - sta oy + 1 - - inx - lda viewport_ox,x - sta ox + 2 - lda viewport_oy,x - sta oy + 2 - - inx - lda viewport_ox,x - sta ox + 3 - lda viewport_oy,x - sta oy + 3 - - rts -.endproc - -.proc _mandel_start - - jsr imul8xe_init - - ; initialize viewport - ldx #0 ; overview - jsr load_viewport - - ; Disable display DMA - lda #0 - sta DMACTL - - jsr clear_screen - jsr display_coords - - ; Copy the display list into properly aligned memory - ; Can't cross 1024-byte boundaries :D - ldx #0 -copy_byte_loop: - lda display_list_start,x - sta display_list,x - inx - cpx #display_list_len - bne copy_byte_loop - - ; Set up the display list - lda #.lobyte(display_list) - sta DLISTL ; actual register - sta SDLSTL ; shadow register the OS will copy in - lda #.hibyte(display_list) - sta DLISTH ; actual register - sta SDLSTH ; shadow register the OS will copy in - - ; Re-enable display DMA - lda #$22 - sta DMACTL - - ; Initialize the palette - lda #0 - sta palette_offset - sta palette_delay - sta chroma_offset - sta chroma_delay - jsr update_palette - - ; install the vblank handler - lda #7 ; deferred - ldx #.hibyte(vblank_handler) - ldy #.lobyte(vblank_handler) - jsr SETVBV - -main_loop: - ; count_frames = 0; count_iters = 0 - lda #0 - sta count_frames - sta count_iters - sta count_iters + 1 - - ; total_sec = 0.0; total_iters = 0.0 - jsr ZFR0 - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FST0R - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) - jsr FST0R - - jsr clear_screen - jsr status_bar - jsr display_coords - - lda #0 - sta fill_level - -fill_loop: - - ; sy = -92 .. 91 - lda #(256-half_height) - sta sy - lda #(256-1) - sta sy + 1 - -loop_sy: - ; sx = -80 .. 
79 - lda #(256-half_width) - sta sx - lda #(256-1) - sta sx + 1 - -loop_sx: - ; check the fill mask - ldy #0 - -loop_skip_level: - cpy fill_level - beq current_level - - lda fill_masks,y - and sx - bne not_skipped_mask1 - - lda fill_masks,y - and sy - beq skipped_mask - -not_skipped_mask1: - iny - jmp loop_skip_level - -current_level: - lda fill_masks,y - and sx - bne skipped_mask - - lda fill_masks,y - and sy - beq not_skipped_mask - -skipped_mask: - jmp skipped - -not_skipped_mask: - - ; run the fractal! - zoom_factor cx, sx, aspect_x - add32 cx, cx, ox - zoom_factor cy, sy, aspect_y - neg32 cy - add32 cy, cy, oy - jsr mandelbrot - jsr pset - - jsr keycheck - beq no_key - ; @fixme clear the pixel stats - jmp main_loop - -no_key: - ; check if we should update the counters - - ; count_frames >= 120? update! - lda count_frames - cmp #120 ; >= 2 seconds - bpl update_status - jmp skipped - -update_status: - ; FR0 = (float)count_iters & clear count_iters - copy16 FR0, count_iters - jsr IFP - lda #0 - sta count_iters - sta count_iters + 1 - - ; FR1 = total_iters - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) - jsr FLD1R - - ; FR0 += FR1 - jsr FADD - - ; total_iters = FR0 - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) - jsr FST0R - - - ; FR0 = (float)count_frames & clear count_frames - ; warning: this should really disable interrupts @TODO - lda count_frames - sta FR0 - lda #0 - sta FR0 + 1 - sta count_frames - jsr IFP - - ; FR0 *= sec_per_frame - ldx #.lobyte(sec_per_frame) - ldy #.hibyte(sec_per_frame) - jsr FLD1R - jsr FMUL - - ; FR0 += total_sec - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FLD1R - jsr FADD - - ; total_sec = FR0 - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FST0R - - ; FR0 /= total_iters - ldx #.lobyte(total_iters) - ldy #.hibyte(total_iters) - jsr FLD1R - jsr FDIV - - ; FR0 *= us_per_sec - ldx #.lobyte(us_per_sec) - ldy #.hibyte(us_per_sec) - jsr FLD1R - jsr FMUL - - ; round (down) to integer - jsr FPI - clc 
- jsr IFP - - lda #speed_start - sta text_col - lda #0 - sta text_row - draw_string_const str_speed - - lda text_col - pha - draw_string_const str_padding - pla - sta text_col - - ; convert to ASCII in INBUFF and print - jsr FASC - jsr draw_string - - ; elapsed time - ; FR0 = total_sec - ldx #.lobyte(total_sec) - ldy #.hibyte(total_sec) - jsr FLD0R - ; FR0 -> integer -> elapsed_work - jsr FPI - lda FR0 - sta elapsed_work - lda FR0 + 1 - sta elapsed_work + 1 - - draw_string_const str_space - - .macro do_countdown divisor, digits - ldx #.lobyte(divisor) - ldy #.hibyte(divisor) - lda #.lobyte(digits) - sta INBUFF - lda #.hibyte(digits) - sta INBUFF + 1 - jsr countdown - .endmacro - do_countdown 36000, digits_space - do_countdown 3600, digits_zero - draw_string_const str_h - do_countdown 600, digits_zero - do_countdown 60, digits_zero - draw_string_const str_m - do_countdown 10, digits_zero - do_countdown 1, digits_zero - draw_string_const str_s - -skipped: - - ; sx += fill_level[fill_masks] + 1 - ldx fill_level - lda fill_masks,x - clc - adc #1 ; will never carry - adc sx - sta sx - lda #0 - adc sx + 1 - sta sx + 1 - - lda sx - cmp #half_width - beq loop_sx_done - jmp loop_sx - -loop_sx_done: - - ; sy += fill_level[fill_masks] + 1 - ldx fill_level - lda fill_masks,x - clc - adc #1 ; will never carry - adc sy - sta sy - lda #0 - adc sy + 1 - sta sy + 1 - - lda sy - cmp #half_height - beq loop_sy_done - jmp loop_sy - -loop_sy_done: - -fill_loop_done: - inc fill_level - lda fill_level - cmp #max_fill_level - beq loop - jmp fill_loop - -loop: - ; finished - - lda #(40 - str_done_len) - sta text_col - lda #0 - sta text_row - draw_string_const str_done - - jsr keycheck - beq loop - jmp main_loop - -.endproc - -; digit string in INBUFF -; divisor X/Y -; clobbers temp, calls draw_string -.proc countdown - divisor = temp - stx divisor - sty divisor + 1 - - ; count the hours - ldy #0 -countdown_loop: - lda elapsed_work + 1 - cmp divisor + 1 - beq countdown_lobyte - bcc 
countdown_done - bcs countdown_inc -countdown_lobyte: - lda elapsed_work - cmp divisor - bcc countdown_done -countdown_inc: - sec - lda elapsed_work - sbc divisor - sta elapsed_work - lda elapsed_work + 1 - sbc divisor + 1 - sta elapsed_work + 1 - iny - jmp countdown_loop -countdown_done: - lda (INBUFF),y - eor #$80 - sta elapsed_digit - lda #.lobyte(elapsed_digit) - sta INBUFF - lda #.hibyte(elapsed_digit) - sta INBUFF + 1 - jsr draw_string - rts -.endproc - -.proc imul8xe_init - - bank_switch 0 - lda #0 - sta EXTENDED_RAM - bank_switch 1 - lda #1 - sta EXTENDED_RAM - bank_switch 0 - lda EXTENDED_RAM - beq init - - ; no bank switching available, we just overwrite the value in base ram - rts - -init: - - ; patch imul16_func into a forwarding thunk to imul16xe_func - lda #$4c ; 'jmp' opcode - sta imul16_func - lda #.lobyte(imul16xe_func) - sta imul16_func + 1 - sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1 - lda #.hibyte(imul16xe_func) - sta imul16_func + 2 - sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2 - - ; ditto for sqr16_func -> sqr16xe_func - lda #$4c ; 'jmp' opcode - sta sqr16_func - lda #.lobyte(sqr16xe_func) - sta sqr16_func + 1 - sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1 - sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1 - lda #.hibyte(sqr16xe_func) - sta sqr16_func + 2 - sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2 - sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2 - - - ; create the lookup table - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - sta ptr - lda #$40 - sta ptr + 1 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -.endproc diff --git a/mandel.c b/mandel.c deleted 
file mode 100644 index f287fa3..0000000 --- a/mandel.c +++ /dev/null @@ -1,15 +0,0 @@ -/** - * The UI and I/O wrapper for the Mandelbrot runner, in C. - * - * For the moment *all* logic is in mandel-core.s, I'm just - * trying to get this to run within a cc65 environment. - * Eventually just the inner loop fun will live in there. - */ - -#include -#include -#include "mandel.h" - -void main(void) { - mandel_start(); -} \ No newline at end of file diff --git a/mandel.h b/mandel.h deleted file mode 100644 index e43fad7..0000000 --- a/mandel.h +++ /dev/null @@ -1,4 +0,0 @@ -#include - -// From mandel-core.s: -extern void mandel_start(void); diff --git a/mandel.s b/mandel.s new file mode 100644 index 0000000..fcc7867 --- /dev/null +++ b/mandel.s @@ -0,0 +1,1320 @@ +; Our zero-page vars +sx = $80 ; i16: screen pixel x +sy = $82 ; i16: screen pixel y +ox = $84 ; fixed4.12: center point x +oy = $86 ; fixed4.12: center point y +cx = $88 ; fixed4.12: c_x +cy = $8a ; fixed4.12: c_y +zx = $8c ; fixed4.12: z_x +zy = $8e ; fixed4.12: z_y + +zx_2 = $90 ; fixed4.12: z_x^2 +zy_2 = $92 ; fixed4.12: z_y^2 +zx_zy = $94 ; fixed4.12: z_x * z_y +dist = $96 ; fixed4.12: z_x^2 + z_y^2 + +iter = $a0 ; u8: iteration count + +zoom = $a1 ; u8: zoom shift level +count_frames = $a2 ; u8 +count_pixels = $a3 ; u8 +total_ms = $a4 ; float48 +total_pixels = $aa ; float48 + +z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not +z_buffer_start = $b1 ; u8: index into z_buffer +z_buffer_end = $b2 ; u8: index into z_buffer +temp = $b4 ; u16 +temp2 = $b6 ; u16 + +pixel_ptr = $b8 ; u16 +pixel_color = $ba ; u8 +pixel_mask = $bb ; u8 +pixel_shift = $bc ; u8 +pixel_offset = $bd ; u8 +fill_level = $be ; u8 +palette_offset = $bf ; u8 + +; FP registers in zero page +FR0 = $d4 ; float48 +FRE = $da +FR1 = $e0 ; float48 +FR2 = $e6 ; float48 +CIX = $f2 ; u8 - index into INBUFF +INBUFF = $f3 ; u16 - pointer to ascii +FLPTR = $fc ; u16 - pointer to user buffer float48 + +CH1 = $02f2 ; previous 
character read from keyboard +CH = $02fc ; current character read from keyboard + +LBUFF = $0580 ; result buffer for FASC routine + +; FP ROM routine vectors +FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) +IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) +FADD = $DA66 ; ADDITION (FR0 += FR1) +FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) +FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) +FDIV = $DB28 ; DIVISION (FR0 /= FR1) +ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) +FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) +FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) +FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) +FMOVE = $DDB6 ; MOVE FR0 TO FR1 + +; High data +framebuffer_top = $8000 +textbuffer = $8f00 +framebuffer_bottom = $9000 +display_list = $9f00 +framebuffer_end = $a000 + +height = 184 +half_height = height >> 1 +width = 160 +half_width = width >> 1 +stride = width >> 2 + +DMACTL = $D400 +DLISTL = $D402 +DLISTH = $D403 +WSYNC = $D40A + +; OS shadow registers +SDLSTL = $230 +SDLSTH = $231 + +; interrupt stuff +SYSVBV = $E45F +XITVBV = $E462 +SETVBV = $E45C + +COLOR0 = $2C4 +COLOR1 = $2C5 +COLOR2 = $2C6 +COLOR3 = $2C7 +COLOR4 = $2C8 + +; Keycodes! 
+KEY_PLUS = $06 +KEY_MINUS = $0e +KEY_UP = $8e +KEY_DOWN = $8f +KEY_LEFT = $86 +KEY_RIGHT = $87 + +.struct float48 + exponent .byte + mantissa .byte 6 +.endstruct + +.import mul_lobyte256 +.import mul_hibyte256 +.import mul_hibyte512 + +.data + +strings: +str_self: + .byte "MANDEL-6502" +str_self_end: +str_speed: + .byte " ms/px" +str_speed_end: +str_run: + .byte " RUN" +str_run_end: +str_done: + .byte "DONE" +str_done_end: + +str_self_len = str_self_end - str_self +str_speed_len = str_speed_end - str_speed +str_run_len = str_run_end - str_run +str_done_len = str_done_end - str_done +speed_precision = 6 + +speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 +speed_len = 14 + str_speed_len + + +char_map: + ; Map ATASCII string values to framebuffer font entries + ; Sighhhhh + .repeat 32, i + .byte i + 64 + .endrepeat + .repeat 64, i + .byte i + .endrepeat + .repeat 32, i + .byte 96 + i + .endrepeat + +hex_chars: + .byte "0123456789abcdef" + +aspect: + ; aspect ratio! + ; pixels at 320w are 5:6 (narrow) + ; pixels at 160w are 5:3 (wide) + ; + ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4) + ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4) + ; + ; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624 + ; &horizontal range -80 .. 79.9 is -3.125 .. 
3.124 + ; + ; 184h is the equiv of 220.8h at square pixels + ; 320 / 220.8 = 1.45 display aspect ratio +aspect_x: ; fixed4.16 5/4 + .word 5 << (12 - 2) + +aspect_y: ; fixed4.16 3/4 + .word 3 << (12 - 2) + +ms_per_frame: ; float48 16.66666667 + .byte 64 ; exponent/sign + .byte $16 ; BCD digits + .byte $66 + .byte $66 + .byte $66 + .byte $67 + +display_list_start: + ; 24 lines overscan + .repeat 3 + .byte $70 ; 8 blank lines + .endrep + + ; 8 scan lines, 1 row of 40-column text + .byte $42 + .addr textbuffer + + ; 184 lines graphics + ; ANTIC mode e (160px 2bpp, 1 scan line per line) + .byte $4e + .addr framebuffer_top + .repeat half_height - 1 + .byte $0e + .endrep + .byte $4e + .addr framebuffer_bottom + .repeat half_height - 1 + .byte $0e + .endrep + + .byte $41 ; jump and blank + .addr display_list +display_list_end: +display_list_len = display_list_end - display_list_start + +color_map: + .byte 0 + .repeat 85 + .byte 1 + .byte 2 + .byte 3 + .endrepeat + +palette: + .byte $00 + .byte $46 + .byte $78 + .byte $b4 +.code + +z_buffer_len = 16 +z_buffer_mask = z_buffer_len - 1 +z_buffer: + ; the last N zx/zy values + .repeat z_buffer_len + .word 0 + .word 0 + .endrepeat + +.export start + +max_fill_level = 6 +fill_masks: + .byte %00011111 + .byte %00001111 + .byte %00000111 + .byte %00000011 + .byte %00000001 + .byte %00000000 + +; 2 + 9 * byte cycles +.macro add bytes, dest, arg1, arg2 + clc ; 2 cyc + .repeat bytes, byte ; 9 * byte cycles + lda arg1 + byte + adc arg2 + byte + sta dest + byte + .endrepeat +.endmacro + +.macro add16 dest, arg1, arg2 + add 2, dest, arg1, arg2 +.endmacro + +.macro add32 dest, arg1, arg2 + add 4, dest, arg2, dest +.endmacro + +.macro add_carry dest + lda dest + adc #0 + sta dest +.endmacro + +; 2 + 9 * byte cycles +.macro sub bytes, dest, arg1, arg2 + sec ; 2 cyc + .repeat bytes, byte ; 9 * byte cycles + lda arg1 + byte + sbc arg2 + byte + sta dest + byte + .endrepeat +.endmacro + +.macro sub16 dest, arg1, arg2 + sub 2, dest, arg1, arg2 
+.endmacro + +.macro sub32 dest, arg1, arg2 + sub 4, dest, arg1, arg2 +.endmacro + +.macro shl bytes, arg + asl arg + .repeat bytes-1, i + rol arg + 1 + i + .endrepeat +.endmacro + +.macro shl16 arg + shl 2, arg +.endmacro + +.macro shl24 arg + shl 3, arg +.endmacro + +.macro shl32 arg + shl 4, arg +.endmacro + +; 6 * bytes cycles +.macro copy bytes, dest, arg + .repeat bytes, byte ; 6 * bytes cycles + lda arg + byte ; 3 cyc + sta dest + byte ; 3 cyc + .endrepeat +.endmacro + +.macro copy16 dest, arg + copy 2, dest, arg +.endmacro + +.macro copy32 dest, arg + copy 4, dest, arg +.endmacro + +.macro copyfloat dest, arg + copy 6, dest, arg +.endmacro + +; 2 + 8 * byte cycles +.macro neg bytes, arg + sec ; 2 cyc + .repeat bytes, byte ; 8 * byte cycles + lda #00 ; 2 cyc + sbc arg + byte ; 3 cyc + sta arg + byte ; 3 cyc + .endrepeat +.endmacro + +; 18 cycles +.macro neg16 arg + neg 2, arg +.endmacro + +; 34 cycles +.macro neg32 arg + neg 4, arg +.endmacro + +; inner loop for imul16 +; bitnum < 8: 25 or 41 cycles +; bitnum >= 8: 30 or 46 cycles +.macro bitmul16 arg1, arg2, result, bitnum + .local zero + .local one + .local next + + ; does 16-bit adds + ; arg1 and arg2 are treated as unsigned + ; negative signed inputs must be flipped first + + ; 7 cycles up to the branch + + ; check if arg1 has 0 or 1 bit in this place + ; 5 cycles either way + .if bitnum < 8 + lda arg1 ; 3 cyc + and #(1 << (bitnum)) ; 2 cyc + .else + lda arg1 + 1 ; 3 cyc + and #(1 << ((bitnum) - 8)) ; 2 cyc + .endif + bne one ; 2 cyc + +zero: ; 18 cyc, 23 cyc + lsr result + 3 ; 5 cyc + jmp next ; 3 cyc + +one: ; 32 cyc, 37 cyc + ; 16-bit add on the top bits + clc ; 2 cyc + lda result + 2 ; 3 cyc + adc arg2 ; 3 cyc + sta result + 2 ; 3 cyc + lda result + 3 ; 3 cyc + adc arg2 + 1 ; 3 cyc + ror a ; 2 cyc - get a jump on the shift + sta result + 3 ; 3 cyc +next: + ror result + 2 ; 5 cyc + ror result + 1 ; 5 cyc + .if bitnum >= 8 + ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte + ; 
when it's all uninitialized data + ror result ; 5 cyc + .endif + +.endmacro + +; 5 to 25 cycles +.macro check_sign arg + ; Check sign bit and flip argument to postive, + ; keeping a count of sign bits in the Y register. + .local positive + lda arg + 1 ; 3 cyc + bpl positive ; 2 cyc + neg16 arg ; 18 cyc + iny ; 2 cyc +positive: +.endmacro + +; 518 - 828 cyc +.macro imul16 dest, arg1, arg2 + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; 470-780 cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + +.macro shift_round_16 arg, shift + .repeat shift + shl32 arg + .endrepeat + round16 arg +.endmacro + +.macro imul16_round dest, arg1, arg2, shift + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; 470-780 cyc + shift_round_16 FR2, shift + copy16 dest, FR2 + 2 ; 12 cyc +.endmacro + +; min 470 cycles +; max 780 cycles +.proc imul16_func_orig + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y + check_sign arg1 ; 5 to 25 cyc + check_sign arg2 ; 5 to 25 cyc + + ; zero out the 32-bit temp's top 16 bits + lda #0 ; 2 cyc + sta result + 2 ; 3 cyc + sta result + 3 ; 3 cyc + ; the bottom two bytes will get cleared by the shifts + + ; unrolled loop for maximum speed, at the cost + ; of a larger routine + ; 440 to 696 cycles + .repeat 16, bitnum + ; bitnum < 8: 25 or 41 cycles + ; bitnum >= 8: 30 or 46 cycles + bitmul16 arg1, arg2, result, bitnum + .endrepeat + + ; In case of mixed input signs, return a negative result. 
+ cpy #1 ; 2 cyc + bne positive_result ; 2 cyc + neg32 result ; 34 cyc +positive_result: + + rts ; 6 cyc +.endproc + +; Adapted from https://everything2.com/title/Fast+6502+multiplication +.macro imul8 dest, arg1, arg2 + .local under256 + .local next + .local small_product + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; setup: 6 cycles + ;ldx mul_factor_x + + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next + under256: + lda mul_hibyte256,x + sec + next: + sta mul_product_hi + lda mul_lobyte256,x + + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 + + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi + small_product: + sec ; - x^2/2: 25 cycles + sbc mul_lobyte256,x + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + .endscope +.endmacro + +.proc imul16_func + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 + + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y + check_sign arg1 ; 5 to 25 cyc + check_sign arg2 ; 5 to 25 cyc + + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + + lda #0 + sta result + 0 + sta result + 1 + sta result + 2 + sta result + 3 + + imul8 inter, arg1, arg2 + add16 result, result, inter + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + + imul8 inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter + + ; In case of mixed input signs, return a 
negative result. + cpy #1 ; 2 cyc + bne positive_result ; 2 cyc + neg32 result ; 34 cyc +positive_result: + + rts ; 6 cyc +.endproc + +.macro round16 arg + ; Round top 16 bits of 32-bit fixed-point number in-place + .local increment + .local high_half + .local check_sign + .local next + + ; low word > $8000: round up + ; = $8000: round up if positive + ; round down if negative + ; < $8000: round down + + lda arg + 1 + cmp #$80 + beq high_half + bpl increment + bmi next + +high_half: + lda arg + beq check_sign + bpl increment + bmi next + +check_sign: + lda arg + 3 + bmi next + +increment: ; 5-10 cyc + inc arg + 2 ; 5 cyc + bne next ; 2 cyc + inc arg + 3 ; 5 cyc + +next: + +.endmacro + +.proc mandelbrot + ; input: + ; cx: position scaled to 4.12 fixed point - -8..+7.9 + ; cy: position scaled to 4.12 + ; + ; output: + ; iter: iteration count at escape or 0 + + ; zx = 0 + ; zy = 0 + ; zx_2 = 0 + ; zy_2 = 0 + ; zx_zy = 0 + ; dist = 0 + ; iter = 0 + lda #00 + ldx #(iter - zx + 1) +initloop: + sta zx - 1,x + dex + bne initloop + sta z_buffer_start + sta z_buffer_end + +loop: + ; iter++ & max-iters break + inc iter + bne keep_going + jmp exit_path +keep_going: + + .macro quick_exit arg, max + .local positive + .local negative + .local nope_out + .local first_equal + .local all_done + + ; check sign bit + lda arg + 1 + bmi negative + + positive: + cmp #((max) << 4) + bmi all_done ; 'less than' + jmp exit_path + + negative: + cmp #(256 - ((max) << 4)) + beq first_equal ; 'equal' on first byte + bpl all_done ; 'greater than' + + nope_out: + jmp exit_path + + first_equal: + lda arg + beq nope_out ; 2nd byte 0 shows it's really 'equal' + + all_done: + .endmacro + + ; 4.12: (-8 .. 
+7.9) + ; zx = zx_2 - zy_2 + cx + sub16 zx, zx_2, zy_2 + add16 zx, zx, cx + quick_exit zx, 2 + + ; zy = zx_zy + zx_zy + cy + add16 zy, zx_zy, zx_zy + add16 zy, zy, cy + quick_exit zy, 2 + + ; zx_2 = zx * zx + imul16_round zx_2, zx, zx, 4 + + ; zy_2 = zy * zy + imul16_round zy_2, zy, zy, 4 + + ; zx_zy = zx * zy + imul16_round zx_zy, zx, zy, 4 + + ; dist = zx_2 + zy_2 + add16 dist, zx_2, zy_2 + quick_exit dist, 4 + + ; if may be in the lake, look for looping output with a small buffer + ; as an optimization vs running to max iters + lda z_buffer_active + beq skip_z_buffer + + ldx z_buffer_start + cpx z_buffer_end + beq z_nothing_to_read + +z_buffer_loop: + .macro z_compare arg + .local compare_no_match + lda z_buffer,x + inx + cmp arg + bne compare_no_match + iny + compare_no_match: + .endmacro + .macro z_advance + .local skip_reset_x + cpx #(z_buffer_len * 4) + bmi skip_reset_x + ldx #0 + skip_reset_x: + .endmacro + .macro z_store arg + lda arg + sta z_buffer,x + inx + .endmacro + + ; Compare the previously stored z values + ldy #0 + z_compare zx + z_compare zx + 1 + z_compare zy + z_compare zy + 1 + + cpy #4 + bne z_no_matches + jmp z_exit + +z_no_matches: + z_advance + + cpx z_buffer_end + bne z_buffer_loop + +z_nothing_to_read: + + ; Store and expand + z_store zx + z_store zx + 1 + z_store zy + z_store zy + 1 + z_advance + stx z_buffer_end + + ; Increment the start roller if necessary (limit size) + lda iter + cmp #(z_buffer_len * 4) + bmi skip_inc_start + lda z_buffer_start + clc + adc #4 + tax + z_advance + stx z_buffer_start +skip_inc_start: + +skip_z_buffer: + + jmp loop + +z_exit: + lda #0 + sta iter + +exit_path: + ldx #0 + lda iter + bne next + inx +next: + stx z_buffer_active + rts + +.endproc + +.macro scale_zoom dest + ; clobbers X, flags + .local cont + .local enough + + ; cx = (sx << (8 - zoom)) + ldx zoom +cont: + cpx #8 + beq enough + shl16 dest + inx + jmp cont +enough: +.endmacro + +.macro zoom_factor dest, src, zoom, aspect + ; clobbers A, X, 
flags, etc + copy16 dest, src + scale_zoom dest + + ; cy = cy * (3 / 4) + ; cx = cx * (5 / 4) + imul16_round dest, dest, aspect, 4 +.endmacro + +.proc pset + ; screen coords in signed sx,sy + ; iter holds the target to use + ; @todo implement + + ; iter -> color + ldx iter + lda color_map,x + sta pixel_color + lda #(255 - 3) + sta pixel_mask + + ; sy -> line base address in temp + lda sy + bpl positive + +negative: + ; temp1 = top half + lda #.lobyte(framebuffer_top + stride * half_height) + sta pixel_ptr + lda #.hibyte(framebuffer_top + stride * half_height) + sta pixel_ptr + 1 + jmp point + +positive: + + lda #.lobyte(framebuffer_bottom) + sta pixel_ptr + lda #.hibyte(framebuffer_bottom) + sta pixel_ptr + 1 + +point: + + ; pixel_ptr += sy * stride + ; temp * 40 + ; = temp * 32 + temp * 8 + ; = (temp << 5) + (temp << 3) + copy16 temp, sy + shl16 temp + shl16 temp + shl16 temp + add16 pixel_ptr, pixel_ptr, temp + shl16 temp + shl16 temp + add16 pixel_ptr, pixel_ptr, temp + + ; Ok so temp1 points to the start of the line, which is 40 bytes. 
+ ; Get the byte and bit offsets + lda sx + clc + adc #half_width + sta temp + + ; pixel_shift = temp & 3 + ; pixel_color <<= pixel_shift (shifting in zeros) + ; pixel_mask <<= pixel_shift (shifting in ones) + and #3 + sta pixel_shift + lda #3 + sec + sbc pixel_shift + tax +shift_loop: + beq shift_done + asl pixel_color + asl pixel_color + sec + rol pixel_mask + sec + rol pixel_mask + dex + jmp shift_loop +shift_done: + + ; pixel_offset = temp >> 2 + lda temp + lsr a + lsr a + sta pixel_offset + tay + + ; read, mask, or, write + lda (pixel_ptr),y + and pixel_mask + ora pixel_color + sta (pixel_ptr),y + + rts +.endproc + +.macro draw_text_indirect col, len, strptr + ; clobbers A, X + .local loop + .local done + ldx #0 +loop: + cpx #len + beq done + txa + tay + lda (strptr),y + tay + lda char_map,y + sta textbuffer + col,x + inx + jmp loop +done: +.endmacro + +.macro draw_text col, len, cstr + ; clobbers A, X + .local loop + .local done + ldx #0 +loop: + cpx #len + beq done + ldy cstr,x + lda char_map,y + sta textbuffer + col,x + inx + jmp loop +done: +.endmacro + +.proc vblank_handler + inc count_frames + inc palette_offset + jsr update_palette + jmp XITVBV +.endproc + +.proc update_palette + lda palette + sta COLOR4 + + clc + lda palette_offset + and #$f0 + adc palette + 1 + sta COLOR0 + + clc + lda palette_offset + and #$f0 + adc palette + 2 + sta COLOR1 + + clc + lda palette_offset + and #$f0 + adc palette + 3 + sta COLOR2 +.endproc + +.proc update_speed + ; convert frames (u16) to fp + ; add to frames_total + ; convert pixels (u16) to fp + ; add to pixels_total + ; (frames_total * 16.66666667) / pixels_total + ; convert to ATASCII + ; draw text +.endproc + +.proc keycheck + ; clobbers all + ; returns 255 in A if state change or 0 if no change + + ; check keyboard buffer + lda CH + cmp #$ff + beq skip_char + + ; Clear the keyboard buffer and re-enable interrupts + ldx #$ff + stx CH + + tay + + lda zoom + cpy #KEY_PLUS + beq plus + cpy #KEY_MINUS + beq minus + + ; 
temp = $0010 << (8 - zoom) + lda #$10 + sta temp + lda #$00 + sta temp + 1 + scale_zoom temp + + cpy #KEY_UP + beq up + cpy #KEY_DOWN + beq down + cpy #KEY_LEFT + beq left + cpy #KEY_RIGHT + beq right + +skip_char: + lda #0 + rts + +plus: + cmp #8 + bpl skip_char + inc zoom + jmp done +minus: + cmp #1 + bmi skip_char + dec zoom + jmp done +up: + sub16 oy, oy, temp + jmp done +down: + add16 oy, oy, temp + jmp done +left: + sub16 ox, ox, temp + jmp done +right: + add16 ox, ox, temp +done: + lda #255 + rts + +.endproc + +.proc clear_screen + ; zero the range from framebuffer_top to display_list + lda #.lobyte(framebuffer_top) + sta temp + lda #.hibyte(framebuffer_top) + sta temp + 1 + +zero_page_loop: + lda #0 + ldy #0 +zero_byte_loop: + sta (temp),y + iny + bne zero_byte_loop + + inc temp + 1 + lda temp + 1 + cmp #.hibyte(display_list) + bne zero_page_loop + + rts +.endproc + +.proc status_bar + ; Status bar + draw_text 0, str_self_len, str_self + draw_text 40 - str_run_len, str_run_len, str_run + + rts +.endproc + +.proc start + + ; ox = 0; oy = 0; zoom = 0 + ; count_frames = 0; count_pixels = 0 + lda #0 + sta ox + sta ox + 1 + sta oy + sta oy + 1 + sta count_frames + sta count_pixels + + ; total_ms = 0.0; total_pixels = 0.0 + ldx #total_ms + jsr ZF1 + ldx #total_pixels + jsr ZF1 + + ; zoom = 2x + lda #1 + sta zoom + + ; Disable display DMA + lda #0 + sta DMACTL + + jsr clear_screen + + ; Copy the display list into properly aligned memory + ; Can't cross 1024-byte boundaries :D + ldx #0 +copy_byte_loop: + lda display_list_start,x + sta display_list,x + inx + cpx #display_list_len + bne copy_byte_loop + + ; Set up the display list + lda #.lobyte(display_list) + sta DLISTL ; actual register + sta SDLSTL ; shadow register the OS will copy in + lda #.hibyte(display_list) + sta DLISTH ; actual register + sta SDLSTH ; shadow register the OS will copy in + + ; Re-enable display DMA + lda #$22 + sta DMACTL + + ; Initialize the palette + lda #0 + sta palette_offset + jsr 
update_palette + + ; install the vblank handler + lda #7 ; deferred + ldx #.hibyte(vblank_handler) + ldy #.lobyte(vblank_handler) + jsr SETVBV + +main_loop: + jsr clear_screen + jsr status_bar + + lda #0 + sta fill_level + +fill_loop: + + ; sy = -92 .. 91 + lda #(256-half_height) + sta sy + lda #(256-1) + sta sy + 1 + +loop_sy: + ; sx = -80 .. 79 + lda #(256-half_width) + sta sx + lda #(256-1) + sta sx + 1 + +loop_sx: + ; check the fill mask + ldy #0 + +loop_skip_level: + cpy fill_level + beq current_level + + lda fill_masks,y + and sx + bne not_skipped_mask1 + + lda fill_masks,y + and sy + beq skipped_mask + +not_skipped_mask1: + iny + jmp loop_skip_level + +current_level: + lda fill_masks,y + and sx + bne skipped_mask + + lda fill_masks,y + and sy + beq not_skipped_mask + +skipped_mask: + jmp skipped + +not_skipped_mask: + + ; run the fractal! + zoom_factor cx, sx, zoom, aspect_x + add16 cx, cx, ox + zoom_factor cy, sy, zoom, aspect_y + add16 cy, cy, oy + jsr mandelbrot + jsr pset + + jsr keycheck + beq no_key + ; @fixme clear the pixel stats + jmp main_loop + +no_key: + ; check if we should update the counters + ; + ; count_pixels >= width? update! + inc count_pixels + lda count_pixels + cmp #width + bmi update_status + + ; count_frames >= 120? update! 
+ lda count_frames + cmp #120 ; >= 2 seconds + bmi skipped + +update_status: + ; FR0 = (float)count_pixels & clear count_pixels + lda count_pixels + sta FR0 + lda #0 + sta FR0 + 1 + sta count_pixels + jsr IFP + + ; FR1 = total_pixels + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) + jsr FLD1R + + ; FR0 += FR1 + jsr FADD + + ; total_pixels = FR0 + ldx #.lobyte(total_pixels) + ldy #.hibyte(total_pixels) + jsr FST0R + + + ; FR0 = (float)count_frames & clear count_frames + ; warning: this should really disable interrupts @TODO + lda count_frames + sta FR0 + lda #0 + sta FR0 + 1 + sta count_frames + jsr IFP + + ; FR0 *= ms_per_frame + ldx #.lobyte(ms_per_frame) + ldy #.hibyte(ms_per_frame) + jsr FLD1R + jsr FMUL + + ; FR0 += total_ms + ldx #total_ms + ldy #0 + jsr FLD1R + jsr FADD + + ; total_ms = FR0 + ldx #total_ms + ldy #0 + jsr FST0R + + ; FR0 /= total_pixels + ldx #total_pixels + ldy #0 + jsr FLD1R + jsr FDIV + + ; convert to ASCII in INBUFF + jsr FASC + + ; print the first 6 digits + draw_text_indirect speed_start, speed_precision, INBUFF + draw_text speed_start + speed_precision, str_speed_len, str_speed + +skipped: + + clc + lda sx + adc #1 + sta sx + lda sx + 1 + adc #0 + sta sx + 1 + + lda sx + cmp #half_width + beq loop_sx_done + jmp loop_sx + +loop_sx_done: + + clc + lda sy + adc #1 + sta sy + lda sy + 1 + adc #0 + sta sy + 1 + + lda sy + cmp #half_height + beq loop_sy_done + jmp loop_sy + +loop_sy_done: + +fill_loop_done: + inc fill_level + lda fill_level + cmp #max_fill_level + beq loop + jmp fill_loop + +loop: + ; finished + draw_text 40 - str_done_len, str_done_len, str_done + jsr keycheck + beq loop + jmp main_loop + +.endproc diff --git a/readme.md b/readme.md index 2c9efc1..6b57378 100644 --- a/readme.md +++ b/readme.md @@ -14,37 +14,32 @@ Non-goals: Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals. 
--- brooke, january 2023 - december 2024 +-- brooke, january 2023 - february 2024 ## Current state -Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. +Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet. -The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. +The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. -* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition -* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops -* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications -* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication +The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input. -The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates. +The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. Iterations are capped at 255. The pixels are run in a progressive layout to get the basic shape on screen faster. -There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D +## Next steps -There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. 
This is a big time saver in fractint. +Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it! -There's some cute color cycling. +Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. + +I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. ## Deps and build instructions I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that. Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices. - -## Todo - -See ideas in `todo.md`. diff --git a/tables.js b/tables.js index 50cbef9..5afc3c0 100644 --- a/tables.js +++ b/tables.js @@ -11,40 +11,23 @@ function db(func) { return lines.join('\n'); } -let squares = []; -for (let i = 0; i < 512; i++) { - squares.push(Math.trunc((i * i + 1) / 2)); -} - console.log( `.segment "TABLES" .export mul_lobyte256 .export mul_hibyte256 .export mul_hibyte512 -.export sqr_lobyte -.export sqr_hibyte -; (i * i + 1) / 2 for the multiplier .align 256 mul_lobyte256: -${db((i) => squares[i] & 0xff)} +${db((x) => Math.round(x * x / 2) & 0xff)} .align 256 mul_hibyte256: -${db((i) => (squares[i] >> 8) & 0xff)} +${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)} .align 256 mul_hibyte512: -${db((i) => (squares[i + 256] >> 8) & 0xff)} - -; (i * i) for the plain squares -.align 256 -sqr_lobyte: -${db((i) => (i * i) & 0xff)} - -.align 256 -sqr_hibyte: -${db((i) => ((i * i) >> 8) & 0xff)} +${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)} `); diff --git a/todo.md b/todo.md deleted file mode 100644 index 6807ae2..0000000 --- a/todo.md +++ /dev/null @@ -1,17 +0,0 @@ -things to try: - -* fix status bar to show elapsed time, per-iter time, per-pixel iter count - -* 'turbo' mode disabling 
graphics in full or part - -* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D - -* maybe clean up the load/layout of the big mul table - -* consider alternate lookup tables in the top 16KB under ROM - -* y-axis mirror optimization - -* extract viewport for display & re-input via keyboard - -* fujinet screenshot/viewport uploader