diff --git a/.gitignore b/.gitignore index 8d2f7ce..771e47a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.xex +tables.s .DS_Store diff --git a/.mailmap b/.mailmap new file mode 100644 index 0000000..3102e50 --- /dev/null +++ b/.mailmap @@ -0,0 +1,2 @@ +Brooke Vibber +Brooke Vibber diff --git a/Makefile b/Makefile index 25148b4..711adcd 100644 --- a/Makefile +++ b/Makefile @@ -2,13 +2,19 @@ all : mandel.xex -%.xex : %.o - ld65 -C atari-asm-xex.cfg -o $@ $< +mandel.xex : mandel.o tables.o atari-asm-xex.cfg + ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o %.o : %.s ca65 -o $@ $< +tables.s : tables.js + node tables.js > tables.s + clean : + rm -f tables.s rm -f *.o rm -f *.xex + rm -f mandel.map + diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg new file mode 100644 index 0000000..93b80f3 --- /dev/null +++ b/atari-asm-xex.cfg @@ -0,0 +1,28 @@ +FEATURES { + STARTADDRESS: default = $2E00; +} +SYMBOLS { + __STARTADDRESS__: type = export, value = %S; +} +MEMORY { + ZP: file = "", define = yes, start = $0082, size = $007E; + MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; + # Keep $4000-7fff clear for expanded RAM access window + TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000; + # Keep $a000-$bfff clear for BASIC cartridge +} +FILES { + %O: format = atari; +} +FORMATS { + atari: runad = start; +} +SEGMENTS { + ZEROPAGE: load = ZP, type = zp, optional = yes; + EXTZP: load = ZP, type = zp, optional = yes; # to enable modules to be able to link to C and assembler programs + CODE: load = MAIN, type = rw, define = yes; + RODATA: load = MAIN, type = ro optional = yes; + DATA: load = MAIN, type = rw optional = yes; + BSS: load = MAIN, type = bss, optional = yes, define = yes; + TABLES: load = TABLES, type = ro, optional = yes, align = 256; +} diff --git a/mandel.s b/mandel.s index 097b700..b52f24a 100644 --- a/mandel.s +++ b/mandel.s @@ -1,32 +1,47 @@ ; Our zero-page vars -sx = $80 ; i16: screen pixel 
x -sy = $82 ; i16: screen pixel y -ox = $84 ; fixed4.12: center point x -oy = $86 ; fixed4.12: center point y -cx = $88 ; fixed4.12: c_x -cy = $8a ; fixed4.12: c_y -zx = $8c ; fixed4.12: z_x -zy = $8e ; fixed4.12: z_y +ox = $80 ; fixed6.26: center point x +oy = $84 ; fixed6.26: center point y +cx = $88 ; fixed6.26: c_x +cy = $8c ; fixed6.26: c_y -zx_2 = $90 ; fixed4.12: z_x^2 -zy_2 = $92 ; fixed4.12: z_y^2 -zx_zy = $94 ; fixed4.12: z_x * z_y -dist = $96 ; fixed4.12: z_x^2 + z_y^2 +zx = $90 ; fixed6.26: z_x +zy = $94 ; fixed6.26: z_y +zx_2 = $98 ; fixed6.26: z_x^2 +zy_2 = $9c ; fixed6.26: z_y^2 -iter = $a0 ; u8: iteration count +zx_zy = $a0 ; fixed6.26: z_x * z_y +dist = $a4 ; fixed6.26: z_x^2 + z_y^2 +sx = $a8 ; i16: screen pixel x +sy = $aa ; i16: screen pixel y +z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not +z_buffer_start = $ad ; u8: index into z_buffer +z_buffer_end = $ae ; u8: index into z_buffer +iter = $af ; u8: iteration count -zoom = $a1 ; u8: zoom shift level -count_frames = $a2 ; u8 -count_pixels = $a3 ; u8 -total_ms = $a4 ; float48 -total_pixels = $aa ; float48 +ptr = $b0 ; u16 +pixel_ptr = $b2 ; u16 +zoom = $b4 ; u8: zoom shift level +fill_level = $b5 ; u8 +pixel_color = $b6 ; u8 +pixel_mask = $b7 ; u8 +pixel_shift = $b8 ; u8 +pixel_offset = $b9 ; u8 +palette_offset = $ba ; u8 +chroma_offset = $bb ; u8 +palette_ticks = $bc ; u8 +chroma_ticks = $bd ; u8 +count_frames = $be ; u8 +; free space $bf -temp = $b0 ; u16 -pixel_ptr = $b2 ; u16 -pixel_color = $b4 ; u8 -pixel_mask = $b5 ; u8 -pixel_shift = $b6 ; u8 -pixel_offset = $b7 ; u8 +count_iters = $c0 ; u16 +text_col = $c2 ; u8 +text_row = $c3 ; u8 +; free space c4-cb +temp = $cc ; u16 +temp2 = $ce ; u16 + +palette_delay = 23 +chroma_delay = 137 ; FP registers in zero page @@ -38,15 +53,20 @@ CIX = $f2 ; u8 - index into INBUFF INBUFF = $f3 ; u16 - pointer to ascii FLPTR = $fc ; u16 - pointer to user buffer float48 +CH1 = $02f2 ; previous character read from keyboard +CH = $02fc ; 
current character read from keyboard + LBUFF = $0580 ; result buffer for FASC routine ; FP ROM routine vectors FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) +FPI = $D9D2 ; floating point to integer FADD = $DA66 ; ADDITION (FR0 += FR1) FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) FDIV = $DB28 ; DIVISION (FR0 /= FR1) +ZFR0 = $DA44 ; clear FR0 ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) @@ -54,59 +74,142 @@ FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) FMOVE = $DDB6 ; MOVE FR0 TO FR1 ; High data -framebuffer_top = $8000 -textbuffer = $8f00 -framebuffer_bottom = $9000 -display_list = $9f00 -framebuffer_end = $a000 +framebuffer_top = $a000 +textbuffer = $af00 +framebuffer_bottom = $b000 +display_list = $bf00 +framebuffer_end = $c000 -height = 184 +height = 176 half_height = height >> 1 width = 160 half_width = width >> 1 stride = width >> 2 +EXTENDED_RAM = $4000 ; 16KiB bank on the XE +PORTB = $D301 ; memory & bank-switch for XL/XE + DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 +WSYNC = $D40A ; OS shadow registers SDLSTL = $230 SDLSTH = $231 ; interrupt stuff +SYSVBV = $E45F XITVBV = $E462 SETVBV = $E45C +COLOR0 = $2C4 +COLOR1 = $2C5 +COLOR2 = $2C6 +COLOR3 = $2C7 +COLOR4 = $2C8 + +; Keycodes! 
+KEY_PLUS = $06 +KEY_MINUS = $0e +KEY_UP = $8e +KEY_DOWN = $8f +KEY_LEFT = $86 +KEY_RIGHT = $87 +KEY_1 = $1f +KEY_2 = $1e +KEY_3 = $1a +KEY_4 = 24 +KEY_5 = 29 +KEY_6 = 27 +KEY_7 = 51 +KEY_8 = 53 +KEY_9 = 48 +KEY_0 = 50 +KEY_PERIOD = 34 +KEY_E = 42 +KEY_X = 22 +KEY_Y = 43 + .struct float48 exponent .byte - mantissa .byte 6 + mantissa .byte 5 .endstruct +.import mul_lobyte256 +.import mul_hibyte256 +.import mul_hibyte512 +.import sqr_lobyte +.import sqr_hibyte + .data strings: str_self: .byte "MANDEL-6502" str_self_end: + .byte 0 str_speed: - .byte "ms/px" + .byte "us/iter: " str_speed_end: + .byte 0 str_run: .byte " RUN" str_run_end: + .byte 0 str_done: .byte "DONE" str_done_end: + .byte 0 +str_padding: + .byte " " +str_padding_end: + .byte 0 + +str_space: + .byte " " + .byte 0 + +str_h: + .byte "h" + .byte 0 +str_m: + .byte "m" + .byte 0 +str_s: + .byte "s" + .byte 0 -str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done +str_padding_len = str_padding_end - str_padding -speed_start = str_self_len + 2 -speed_len = 14 + str_speed_len +; "3h59m59s" +str_elapsed_spacer = 8 +speed_start = 40 - str_done_len - str_speed_len - str_padding_len - str_elapsed_spacer - 1 +col_x = 1 +str_x: + .byte "X:" + .byte 0 +str_x_len = 2 +str_x_space = 12 +str_x_padding = 2 + +col_y = col_x + str_x_len + str_x_space + str_x_padding +str_y: + .byte "Y:" + .byte 0 +str_y_len = 2 +str_y_space = 12 +str_y_padding = 2 + +col_zoom = col_y + str_y_len + str_y_space + str_y_padding +str_zoom: + .byte "ZOOM:" + .byte 0 +str_zoom_len = 5 char_map: ; Map ATASCII string values to framebuffer font entries @@ -121,6 +224,13 @@ char_map: .byte 96 + i .endrepeat +hex_chars: +digits_zero: + .byte "0123456789abcdef" + +digits_space: + .byte " 123456789abcdef" + aspect: ; aspect ratio! 
; pixels at 320w are 5:6 (narrow) @@ -134,20 +244,49 @@ aspect: ; ; 184h is the equiv of 220.8h at square pixels ; 320 / 220.8 = 1.45 display aspect ratio -aspect_x: ; fixed4.16 5/4 - .word 5 << (12 - 2) +aspect_x: ; fixed3.13 5/4 + .word 5 << (13 - 2) -aspect_y: ; fixed4.16 3/4 - .word 3 << (12 - 2) +aspect_y: ; fixed3.13 3/4 + .word 3 << (13 - 2) -ms_per_frame: ; float48 16.66666667 - .byte 64 ; exponent/sign - .byte $16 ; BCD digits +fixed3_13_as_float: ; float48 + ; 1 << 13 + ; 8192 + ; 81 92 . 00 00 00 + .byte 65 ; exponent/sign - +1 byte + .byte $81 + .byte $92 + .byte $00 + .byte $00 + .byte $00 + +sec_per_frame: ; float48 00 . 01 66 66 66 67 + .byte 63 ; exponent/sign - -1 bytes + .byte $01 ; BCD digits .byte $66 .byte $66 .byte $66 .byte $67 +us_per_sec: ; float48 1e6 01 00 0,0 00 . 00 + .byte 67 ; exponent/sign +3 bytes + .byte $01 ; BCD digits + .byte $00 + .byte $00 + .byte $00 + .byte $00 + +total_iters: ; float48 + .repeat 6 + .byte 0 + .endrepeat + +total_sec: ; float48 + .repeat 6 + .byte 0 + .endrepeat + display_list_start: ; 24 lines overscan .repeat 3 @@ -171,6 +310,10 @@ display_list_start: .byte $0e .endrep + ; 8 scan lines, 1 row of 40-column text + .byte $42 + .addr textbuffer + 40 + .byte $41 ; jump and blank .addr display_list display_list_end: @@ -179,15 +322,98 @@ display_list_len = display_list_end - display_list_start color_map: .byte 0 .repeat 85 - .byte 1 - .byte 2 - .byte 3 + .byte %01010101 + .byte %10101010 + .byte %11111111 .endrepeat + +palette_start: + .byte $0e + .byte $08 + .byte $04 +palette_repeat: + .byte $0e + .byte $08 + +palette_entries = 3 + +palette_chroma: + .repeat 15, i + .byte (i + 1) << 4 + .endrepeat + .repeat 2, i + .byte (i + 1) << 4 + .endrepeat +palette_chroma_entries = 15 + .code +;z_buffer_len = 16 ; 10.863 ms/px +;z_buffer_len = 12 ; 10.619 ms/px +z_buffer_len = 8 ; 10.612 ms/px +;z_buffer_len = 4 ; 12.395 ms/px +z_buffer_mask = z_buffer_len - 1 +z_buffer: + ; the last N zx/zy values + .repeat z_buffer_len
+ .word 0 + .word 0 + .endrepeat + .export start +;max_fill_level = 6 +max_fill_level = 3 +fill_masks: +; .byte %00011111 +; .byte %00001111 +; .byte %00000111 + .byte %00000011 + .byte %00000001 + .byte %00000000 + +pixel_masks: + .byte %11111111 + .byte %11110000 + .byte %11000000 + +viewport_zoom: + .byte 0 + .byte 5 + .byte 7 + .byte 5 + .byte 7 + .byte 7 + +viewport_ox: + .dword ($00000000 & $3fffffff) << 2 + .dword ($ff110000 & $3fffffff) << 2 + .dword ($ff110000 & $3fffffff) << 2 + .dword ($fe400000 & $3fffffff) << 2 + .dword ($fe3b0000 & $3fffffff) << 2 + .dword $fd220000 + +viewport_oy: + .dword ($00000000 & $3fffffff) << 2 + .dword ($ffb60000 & $3fffffff) << 2 + .dword ($ffbe0000 & $3fffffff) << 2 + .dword ($00000000 & $3fffffff) << 2 + .dword ($fffe0000 & $3fffffff) << 2 + .dword $ff000000 + +elapsed_work: + .dword 0 +elapsed_digit: + .byte 0 + +input_col: + .byte 0 +input_row: + .byte 0 +input_max: + .byte 0 + ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 clc ; 2 cyc @@ -198,12 +424,21 @@ color_map: .endrepeat .endmacro +; 20 cycles .macro add16 dest, arg1, arg2 add 2, dest, arg1, arg2 .endmacro +; 38 cycles .macro add32 dest, arg1, arg2 - add 4, dest, arg2, dest + add 4, dest, arg1, arg2 +.endmacro + +; 8 cycles +.macro add_carry dest + lda dest ; 3 cyc + adc #0 ; 2 cyc + sta dest ; 3 cyc .endmacro ; 2 + 9 * byte cycles @@ -216,34 +451,41 @@ color_map: .endrepeat .endmacro +; 20 cycles .macro sub16 dest, arg1, arg2 sub 2, dest, arg1, arg2 .endmacro +; 38 cycles .macro sub32 dest, arg1, arg2 sub 4, dest, arg1, arg2 .endmacro +; 3 + 5 * (bytes - 1) cycles .macro shl bytes, arg - asl arg + asl arg ; 3 cyc .repeat bytes-1, i - rol arg + 1 + i + rol arg + 1 + i ; 5 cyc .endrepeat .endmacro +; 8 cycles .macro shl16 arg shl 2, arg .endmacro +; 13 cycles .macro shl24 arg shl 3, arg .endmacro +; 18 cycles .macro shl32 arg shl 4, arg .endmacro ; 6 * bytes cycles +; 4 * bytes bytes .macro copy bytes, dest, arg .repeat bytes, byte ; 6 * bytes cycles lda 
arg + byte ; 3 cyc @@ -251,14 +493,18 @@ color_map: .endrepeat .endmacro +; 12 cycles +; 8 bytes .macro copy16 dest, arg copy 2, dest, arg .endmacro +; 24 cycles .macro copy32 dest, arg copy 4, dest, arg .endmacro +; 36 cycles .macro copyfloat dest, arg copy 6, dest, arg .endmacro @@ -283,127 +529,337 @@ color_map: neg 4, arg .endmacro -; inner loop for imul16 -; bitnum < 8: 25 or 41 cycles -; bitnum >= 8: 30 or 46 cycles -.macro bitmul16 arg1, arg2, result, bitnum - .local zero - .local one - .local next - - ; does 16-bit adds - ; arg1 and arg2 are treated as unsigned - ; negative signed inputs must be flipped first - - ; 7 cycles up to the branch - - ; check if arg1 has 0 or 1 bit in this place - ; 5 cycles either way - .if bitnum < 8 - lda arg1 ; 3 cyc - and #(1 << (bitnum)) ; 2 cyc - .else - lda arg1 + 1 ; 3 cyc - and #(1 << ((bitnum) - 8)) ; 2 cyc - .endif - bne one ; 2 cyc - -zero: ; 18 cyc, 23 cyc - lsr result + 3 ; 5 cyc - jmp next ; 3 cyc - -one: ; 32 cyc, 37 cyc - ; 16-bit add on the top bits - clc ; 2 cyc - lda result + 2 ; 3 cyc - adc arg2 ; 3 cyc - sta result + 2 ; 3 cyc - lda result + 3 ; 3 cyc - adc arg2 + 1 ; 3 cyc - ror a ; 2 cyc - get a jump on the shift - sta result + 3 ; 3 cyc -next: - ror result + 2 ; 5 cyc - ror result + 1 ; 5 cyc - .if bitnum >= 8 - ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte - ; when it's all uninitialized data - ror result ; 5 cyc - .endif - +; 11-27 + 18 * shift cycles +; 65-81 cycles for shift=3 +.macro shift_round_16 arg, shift + .repeat shift + shl32 arg ; 18 cycles + .endrepeat + round16 arg ; 11-27 cycles .endmacro -; 5 to 25 cycles -.macro check_sign arg - ; Check sign bit and flip argument to postive, - ; keeping a count of sign bits in the X register. 
- .local positive - lda arg + 1 ; 3 cyc - bpl positive ; 2 cyc - neg16 arg ; 18 cyc - inx ; 2 cyc -positive: -.endmacro - -; 518 - 828 cyc +; input: arg1, arg2 as fixed4.12 +; output: dest as fixed8.24 +; patch point jsr at 16 bytes in +imul16_patch_offset = 16 .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc + jsr imul16_func ; ? cyc copy32 dest, FR2 ; 24 cyc .endmacro -.macro shift_round_16 arg, shift - .repeat shift - shl32 arg +; input: arg as fixed4.12 +; output: dest as fixed8.24 +; patch point jsr at 8 bytes in +sqr16_patch_offset = 8 +.macro sqr16 dest, arg + copy16 FR0, arg ; 12 cyc + jsr sqr16_func ; ? cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + +; input: arg as u8 +; output: dest as u16 +; clobbers a, x +.macro sqr8 dest, arg + ldx arg + lda sqr_lobyte,x + sta dest + lda sqr_hibyte,x + sta dest + 1 +.endmacro + +.segment "TABLES" +; lookup table for top byte -> PORTB value for bank-switch +.align 256 +bank_switch_table: + .repeat 256, i + .byte ((i & $c0) >> 4) | $e3 .endrepeat - round16 arg + +.code + +.macro bank_switch bank + lda #((bank << 2) | $e3) + sta PORTB .endmacro -.macro imul16_round dest, arg1, arg2, shift - copy16 FR0, arg1 ; 12 cyc - copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; 470-780 cyc - shift_round_16 FR2, shift - copy16 dest, FR2 + 2 ; 12 cyc +.macro imul8 dest, arg1, arg2, xe + .if xe + ; using 64KB lookup table + ; 51-70 cycles + ; clobbers x, y, dest, ptr + .scope + output = dest + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + txa ; 2 cyc + and #$3f ; 2 cyc + ora #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; copy the entry into output + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + tay ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc + + ; note: we 
are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled + + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc + + ; add arg2 one last time for the skipped bit + clc ; 2 cyc + txa ; 2 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc + + done: + .endscope + .else + ; Using base 48k RAM compatibility mode + ; Small table of half squares + ; Adapted from https://everything2.com/title/Fast+6502+multiplication + ; 81-92 cycles + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; 3 cyc + + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc + under256: + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc + next: + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc + + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc + + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc mul_product_hi ; 5 cyc + + ; - x^2/2 + small_product: + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + .endscope + .endif .endmacro -; min 470 cycles -; max 780 cycles -.proc imul16_func + +; Initialize a 16 KB chunk of the table +; input: multipliers in temp +; output: new multipliers in temp +; clobbers: temp, 
temp2 +.proc imul8xe_init_section + arg1 = FR1 + arg2 = FR2 + result = FR0 + ptr = temp2 + + lda #$00 + sta ptr + lda #$40 + sta ptr + 1 + + ldy #0 + + ; outer loop: $00 -> $3f +outer_loop: + + ; reset result to 0 + lda #0 + sta result + sta result + 1 + + ; inner loop: $00 -> $ff +inner_loop: + + ; copy result to data set + lda result + sta (ptr),y + lda result + 1 + iny + sta (ptr),y + dey + + ; result += 2 * arg2 + clc + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + 1 + clc + lda arg2 + adc result + sta result + lda #0 + adc result + 1 + sta result + 1 + + ; inner loop check + inc arg1 + inc arg1 + inc ptr + inc ptr + bne inner_loop + + ; outer loop check + inc arg2 + inc ptr + 1 + lda ptr + 1 + cmp #$80 + bne outer_loop + + rts + +.endproc + +.macro imul16_impl xe + .local arg1 + .local arg2 + .local result + .local inter + .local arg1_pos + .local arg2_pos arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result + inter = temp2 - ldx #0 ; 2 cyc - ; counts the number of sign bits in X - check_sign arg1 ; 5 to 25 cyc - check_sign arg2 ; 5 to 25 cyc - - ; zero out the 32-bit temp's top 16 bits - lda #0 ; 2 cyc - sta result + 2 ; 3 cyc - sta result + 3 ; 3 cyc - ; the bottom two bytes will get cleared by the shifts + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - ; unrolled loop for maximum speed, at the cost - ; of a larger routine - ; 440 to 696 cycles - .repeat 16, bitnum - ; bitnum < 8: 25 or 41 cycles - ; bitnum >= 8: 30 or 46 cycles - bitmul16 arg1, arg2, result, bitnum - .endrepeat + imul8 result, arg1, arg2, xe - ; In case of mixed input signs, return a negative result. 
- cpx #1 ; 2 cyc - bne positive_result ; 2 cyc - neg32 result ; 34 cyc -positive_result: + imul8 result + 2, arg1 + 1, arg2 + 1, xe + + imul8 inter, arg1 + 1, arg2, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8 inter, arg1, arg2 + 1, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg1 + 1 + bpl arg1_pos + sub16 result + 2, result + 2, arg2 +arg1_pos: + lda arg2 + 1 + bpl arg2_pos + sub16 result + 2, result + 2, arg1 +arg2_pos: rts ; 6 cyc +.endmacro + +.macro sqr16_impl xe + .scope + arg = FR0 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + ;inter = temp2 + inter = FR1 + + lda arg + 1 + bpl arg_pos + neg16 arg + arg_pos: + + ; hl * hl + ; (h*256 + l) * (h*256 + l) + ; h*256*(h*256 + l) + l*(h*256 + l) + ; h*h*256*256 + h*l*256 + h*l*256 + l*l + + sqr8 result, arg + + sqr8 result + 2, arg + 1 + + imul8 inter, arg + 1, arg, xe + add16 result + 1, result + 1, inter + add_carry result + 3 + add16 result + 1, result + 1, inter + add_carry result + 3 + + rts ; 6 cyc + .endscope +.endmacro + +.proc imul16_func + imul16_impl 0 .endproc +.proc imul16xe_func + imul16_impl 1 +.endproc + +.proc sqr16_func + sqr16_impl 0 +.endproc + +.proc sqr16xe_func + sqr16_impl 1 +.endproc + +; 11-27 cycles .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment @@ -416,21 +872,28 @@ positive_result: ; round down if negative ; < $8000: round down - lda arg + 1 - cmp #$80 - beq high_half - bpl increment - bmi next + ; $8000 17 + ; $8001 27 + ; $8100 21 + ; $7fff 11 + + lda arg + 1 ; 3 cyc + cmp #$80 ; 2 cyc + beq high_half ; 2 cyc + + bpl increment ; 2 cyc + + bmi next ; 2 cyc high_half: - lda arg - beq check_sign - bpl increment - bmi next + lda arg ; 3 cyc + beq check_sign ; 2 cyc + + jmp increment ; 3 cyc check_sign: - lda arg + 3 - bmi next + lda arg + 3 ; 3 cyc + bmi next ; 2 cyc increment: ; 5-10 
cyc inc arg + 2 ; 5 cyc @@ -441,10 +904,83 @@ next: .endmacro +; input in FR0, 16 bits signed 3.13 fixed +; output in FR0, Atari float +; clobbers a, x, y, FR0, FR1 +.proc fixed3_13_to_float + ldx #.lobyte(fixed3_13_as_float) + ldy #.hibyte(fixed3_13_as_float) + jsr FLD1R + + ; check sign bit! conversion routine is for unsigned + lda FR0 + 1 + bpl positive + +negative: + neg16 FR0 + jsr IFP + + ; set float sign bit + lda FR0 + ora #$80 + sta FR0 + jmp common + +positive: + jsr IFP + +common: + jsr FDIV + rts + +.endproc + +; rounds to 16-bit first! +; input in FR0, 32 bits signed 6.26 fixed +; output in FR0, Atari float +; clobbers a, x, y, FR0, FR1 +.proc fixed6_26_to_float + shift_round_16 FR0, 3 + copy16 FR0, FR0 + 2 + jsr fixed3_13_to_float + rts +.endproc + +; input in FR0, Atari float +; output in FR0, 16 bits signed 3.13 fixed +; clobbers a, x, y, FR0, FR1 +.proc float_to_fixed3_13 + ldx #.lobyte(fixed3_13_as_float) + ldy #.hibyte(fixed3_13_as_float) + jsr FLD1R + jsr FMUL + + ; check sign bit! 
conversion routine is for unsigned + lda FR0 + bpl positive ; sign is bit 7 of the exponent byte just loaded (LDA never changes carry) + +negative: + ; clear float sign bit + lda FR0 + eor #$80 + sta FR0 + + jsr FPI + neg16 FR0 + jmp common + +positive: + jsr FPI + +common: + rts + +.endproc + .proc mandelbrot ; input: - ; cx: position scaled to 4.12 fixed point - -8..+7.9 - ; cy: position scaled to 4.12 + ; cx: position scaled to 6.26 fixed point - -32..+31.9 + ; cy: position scaled to 6.26 ; ; output: ; iter: iteration count at escape or 0 @@ -456,21 +992,59 @@ next: ; zx_zy = 0 ; dist = 0 ; iter = 0 +; lda #00 +; ldx #(iter - zx + 1) +;initloop: +; sta zx - 1,x +; dex +; bne initloop +; sta z_buffer_start +; sta z_buffer_end + lda #00 - ldx #(iter - zx + 1) -initloop: - sta zx - 1,x - dex - bne initloop + sta zx + sta zx + 1 + sta zx + 2 + sta zx + 3 + sta zy + sta zy + 1 + sta zy + 2 + sta zy + 3 + sta zx_2 + sta zx_2 + 1 + sta zx_2 + 2 + sta zx_2 + 3 + sta zy_2 + sta zy_2 + 1 + sta zy_2 + 2 + sta zy_2 + 3 + sta zx_zy + sta zx_zy + 1 + sta zx_zy + 2 + sta zx_zy + 3 + sta dist + sta dist + 1 + sta dist + 2 + sta dist + 3 + sta iter + sta z_buffer_start + sta z_buffer_end loop: + inc count_iters + bne low_iters + inc count_iters + 1 +low_iters: + ; iter++ & max-iters break inc iter bne keep_going - rts + jmp exit_path keep_going: .macro quick_exit arg, max + ; arg: fixed6.26 + ; max: integer .local positive .local negative .local nope_out @@ -478,68 +1052,162 @@ keep_going: .local all_done ; check sign bit - lda arg + 1 + lda arg + 3 bmi negative positive: - cmp #((max) << 4) + cmp #(max << 2) bmi all_done ; 'less than' - rts + jmp exit_path negative: - cmp #(256 - ((max) << 4)) + cmp #(256 - (max << 2)) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: - rts - + jmp exit_path + first_equal: + ; following bytes all 0 shows it's really 'equal' + lda arg + 2 + bne all_done + lda arg + 1 + bne all_done lda arg - beq nope_out ; 2nd byte 0 shows it's really 'equal' + bne all_done + jmp exit_path all_done:
.endmacro - ; 4.12: (-8 .. +7.9) + ; 6.26: (-32 .. 31.9) ; zx = zx_2 - zy_2 + cx - sub16 zx, zx_2, zy_2 - add16 zx, zx, cx + sub32 zx, zx_2, zy_2 + add32 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy - add16 zy, zx_zy, zx_zy - add16 zy, zy, cy + add32 zy, zx_zy, zx_zy + add32 zy, zy, cy quick_exit zy, 2 + ; convert 6.26 -> 3.13: (-4 .. +3.9) + shift_round_16 zx, 3 + shift_round_16 zy, 3 + ; zx_2 = zx * zx - imul16_round zx_2, zx, zx, 4 +fixup_sqr16_1: + sqr16 zx_2, zx + 2 ; zy_2 = zy * zy - imul16_round zy_2, zy, zy, 4 +fixup_sqr16_2: + sqr16 zy_2, zy + 2 ; zx_zy = zx * zy - imul16_round zx_zy, zx, zy, 4 +fixup_imul16_1: + imul16 zx_zy, zx + 2, zy + 2 ; dist = zx_2 + zy_2 - add16 dist, zx_2, zy_2 + add32 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer ; as an optimization vs running to max iters + lda z_buffer_active + beq skip_z_buffer + + ldx z_buffer_start + cpx z_buffer_end + beq z_nothing_to_read + +z_buffer_loop: + .macro z_compare arg + .local compare_no_match + lda z_buffer,x + inx + cmp arg + bne compare_no_match + iny + compare_no_match: + .endmacro + .macro z_advance + .local skip_reset_x + cpx #(z_buffer_len * 4) + bmi skip_reset_x + ldx #0 + skip_reset_x: + .endmacro + .macro z_store arg + lda arg + sta z_buffer,x + inx + .endmacro + + ; Compare the previously stored z values + ldy #0 + z_compare zx + 2 + z_compare zx + 3 + z_compare zy + 2 + z_compare zy + 3 + + cpy #4 + bne z_no_matches + jmp z_exit + +z_no_matches: + z_advance + + cpx z_buffer_end + bne z_buffer_loop + +z_nothing_to_read: + + ; Store and expand + z_store zx + 2 + z_store zx + 3 + z_store zy + 2 + z_store zy + 3 + z_advance + stx z_buffer_end + + ; Increment the start roller if necessary (limit size) + lda iter + cmp #(z_buffer_len * 4) + bmi skip_inc_start + lda z_buffer_start + clc + adc #4 + tax + z_advance + stx z_buffer_start +skip_inc_start: + +skip_z_buffer: + jmp loop -peace_out: +z_exit: + lda #0 + sta iter + 
+exit_path: + ldx #0 + lda iter + bne next + inx +next: + stx z_buffer_active rts .endproc -.macro zoom_factor dest, src, zoom, aspect +.macro scale_zoom dest + ; clobbers X, flags .local cont .local enough ; cx = (sx << (8 - zoom)) - copy16 dest, src ldx zoom cont: cpx #8 @@ -548,10 +1216,19 @@ cont: inx jmp cont enough: +.endmacro + +.macro zoom_factor dest, src, aspect + ; output: dest: fixed6.26 + ; input: src: fixed3.13 + ; aspect: fixed3.13 + ; clobbers A, X, flags, etc + copy16 dest, src + scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16_round dest, dest, aspect, 4 + imul16 dest, dest, aspect .endmacro .proc pset @@ -562,8 +1239,11 @@ enough: ; iter -> color ldx iter lda color_map,x + ldx fill_level + and pixel_masks,x sta pixel_color - lda #(255 - 3) + lda pixel_masks,x + eor #$ff sta pixel_mask ; sy -> line base address in temp @@ -612,22 +1292,23 @@ point: ; pixel_mask <<= pixel_shift (shifting in ones) and #3 sta pixel_shift - lda #3 - sec - sbc pixel_shift tax shift_loop: beq shift_done - asl pixel_color - asl pixel_color + lsr pixel_color + lsr pixel_color sec - rol pixel_mask + ror pixel_mask sec - rol pixel_mask + ror pixel_mask dex jmp shift_loop shift_done: + ldy fill_level + ldx fill_masks,y + inx + ; pixel_offset = temp >> 2 lda temp lsr a @@ -635,36 +1316,162 @@ shift_done: sta pixel_offset tay +draw_pixel: ; read, mask, or, write lda (pixel_ptr),y and pixel_mask ora pixel_color sta (pixel_ptr),y + dex + beq done + clc + lda #40 + adc pixel_ptr + sta pixel_ptr + lda #0 + adc pixel_ptr + 1 + sta pixel_ptr + 1 + jmp draw_pixel + +done: rts .endproc -.macro draw_text col, len, cstr - ; clobbers A, X - .local loop - .local done - ldx #0 +; in/out: column in text_col +; in: row in text_row +; in: pointer to string in INBUFF +; clobbers x/y/a/temp +.proc draw_string + drawptr = temp + strptr = INBUFF + + clc + lda #.lobyte(textbuffer) + adc text_col + sta temp + lda #.hibyte(textbuffer) + adc #0 + sta temp + 1 + + ldx text_row + beq 
done_rows +continue_rows: + clc + lda temp + adc #40 + sta temp + lda temp + 1 + adc #0 + sta temp + 1 + dex + bne continue_rows + +done_rows: + + ldy #0 loop: - cpx #len + lda (strptr),y + ; if char's null, terminate c-style beq done - ldy cstr,x - lda char_map,y - sta textbuffer + col,x - inx + ; save the char for terminator check + pha + ; strip the high bit (terminator) + and #$7f + tax + lda char_map,x + sta (drawptr),y + iny + + pla + ; _last_ char has high bit set in atari rom routines + bmi done jmp loop + done: + ; move the text column pointer + tya + clc + adc text_col + sta text_col + + rts +.endproc + +.macro draw_string_const str + lda #.lobyte(str) + sta INBUFF + lda #.hibyte(str) + sta INBUFF + 1 + jsr draw_string .endmacro .proc vblank_handler inc count_frames + + inc chroma_ticks + lda chroma_ticks + cmp #(chroma_delay) + bne skip_chroma + + lda #0 + sta chroma_ticks + + inc chroma_offset + lda chroma_offset + cmp #(palette_chroma_entries) + bne skip_chroma + + lda #0 + sta chroma_offset +skip_chroma: + + inc palette_ticks + lda palette_ticks + cmp #(palette_delay) + bne skip_luma + + lda #0 + sta palette_ticks + + inc palette_offset + lda palette_offset + cmp #(palette_entries) + bne skip_luma + + lda #0 + sta palette_offset + +skip_luma: + jsr update_palette jmp XITVBV .endproc +.proc update_palette + lda #0 + sta COLOR4 + + ldx chroma_offset + ldy palette_offset + lda palette_chroma,x + ora palette_start,y + sta COLOR2 + + ;inx + iny + lda palette_chroma,x + ora palette_start,y + sta COLOR1 + + ;inx + iny + lda palette_chroma,x + ora palette_start,y + sta COLOR0 + + rts +.endproc + .proc update_speed ; convert frames (u16) to fp ; add to frames_total @@ -675,33 +1482,155 @@ done: ; draw text .endproc -.proc start +.proc keycheck + ; clobbers all + ; returns 255 in A if state change or 0 if no change - ; ox = 0; oy = 0; zoom = 0 - ; count_frames = 0; count_pixels = 0 + ; check keyboard buffer + lda CH + cmp #$ff + beq skip_char + + ; Clear the 
keyboard buffer and re-enable interrupts + ldx #$ff + stx CH + + tay + + lda zoom + cpy #KEY_PLUS + beq plus + cpy #KEY_MINUS + beq minus + + ; temp+temp2 = $00010000 << (8 - zoom) + lda #$00 + sta temp + sta temp + 1 + lda #$01 + sta temp + 2 + lda #$00 + sta temp + 3 + scale_zoom temp + 2 + + cpy #KEY_UP + beq up + cpy #KEY_DOWN + beq down + cpy #KEY_LEFT + beq left + cpy #KEY_RIGHT + beq right + jmp number_keys + +skip_char: lda #0 - sta ox - sta ox + 1 - sta oy - sta oy + 1 - sta count_frames - sta count_pixels + rts - ; total_ms = 0.0; total_pixels = 0.0 - ldx #total_ms - jsr ZF1 - ldx #total_pixels - jsr ZF1 +plus: + lda zoom + cmp #7 + bpl skip_char + inc zoom + jmp done +minus: + lda zoom + cmp #1 + bmi skip_char + dec zoom + jmp done +up: + add32 oy, oy, temp + jsr display_coords + jmp done +down: + sub32 oy, oy, temp + jsr display_coords + jmp done +left: + sub32 ox, ox, temp + jsr display_coords + jmp done +right: + add32 ox, ox, temp + jsr display_coords + jmp done - ; zoom = 2x - lda #1 - sta zoom +number_keys: + cpy #KEY_1 + beq one + cpy #KEY_2 + beq two + cpy #KEY_3 + beq three + cpy #KEY_4 + beq four + cpy #KEY_5 + beq five + cpy #KEY_6 + beq six + jmp letter_keys - ; Disable display DMA - lda #0 - sta DMACTL +one: + ldx #0 + jmp load_key_viewport +two: + ldx #1 + jmp load_key_viewport +three: + ldx #2 + jmp load_key_viewport +four: + ldx #3 + jmp load_key_viewport +five: + ldx #4 + jmp load_key_viewport +six: + ldx #5 + jmp load_key_viewport - ; zero the range from framebuffer_top to framebuffer_end +letter_keys: + cpy #KEY_X + bne not_x + jsr input_x + jmp done +not_x: + cpy #KEY_Y + bne not_y + jsr input_y + jmp done +not_y: + jmp skip_char + +load_key_viewport: + jsr load_viewport + ; fall through +done: + lda #255 + rts + +.endproc + +.proc input_x + ldx #col_x + ldy #1 + jsr input_number + + + rts +.endproc + +.proc input_y + rts +.endproc + +.proc input_number + rts +.endproc + +.proc clear_screen + ; zero the range from framebuffer_top to 
display_list lda #.lobyte(framebuffer_top) sta temp lda #.hibyte(framebuffer_top) @@ -717,9 +1646,120 @@ zero_byte_loop: inc temp + 1 lda temp + 1 - cmp #.hibyte(framebuffer_end) + cmp #.hibyte(display_list) bne zero_page_loop + rts +.endproc + +.proc status_bar + ; Status bar + + lda #0 + sta text_col + lda #0 + sta text_row + draw_string_const str_self + + lda #(40 - str_run_len) + sta text_col + draw_string_const str_run + + rts +.endproc + +.proc display_coords + lda #1 + sta text_row + lda #col_x + sta text_col + draw_string_const str_x + + copy32 FR0, ox + jsr fixed6_26_to_float + jsr FASC + jsr draw_string + + lda #col_y + sta text_col + draw_string_const str_y + + copy32 FR0, oy + jsr fixed6_26_to_float + jsr FASC + jsr draw_string + + lda #col_zoom + sta text_col + draw_string_const str_zoom + + lda zoom + clc + adc #0 + sta FR0 + lda #0 + sta FR0 + 1 + jsr IFP + jsr FASC + jsr draw_string + + rts + +.endproc + +; input: viewport selector in x +; clobbers: a, x +.proc load_viewport + + lda viewport_zoom,x + sta zoom + + txa + asl a + asl a + + tax + lda viewport_ox,x + sta ox + lda viewport_oy,x + sta oy + + inx + lda viewport_ox,x + sta ox + 1 + lda viewport_oy,x + sta oy + 1 + + inx + lda viewport_ox,x + sta ox + 2 + lda viewport_oy,x + sta oy + 2 + + inx + lda viewport_ox,x + sta ox + 3 + lda viewport_oy,x + sta oy + 3 + + rts +.endproc + +.proc start + + jsr imul8xe_init + + ; initialize viewport + ldx #0 ; overview + jsr load_viewport + + ; Disable display DMA + lda #0 + sta DMACTL + + jsr clear_screen + jsr display_coords + ; Copy the display list into properly aligned memory ; Can't cross 1024-byte boundaries :D ldx #0 @@ -738,14 +1778,18 @@ copy_byte_loop: sta DLISTH ; actual register sta SDLSTH ; shadow register the OS will copy in - ; Status bar - draw_text 0, str_self_len, str_self - draw_text 40 - str_run_len, str_run_len, str_run - ; Re-enable display DMA lda #$22 sta DMACTL + ; Initialize the palette + lda #0 + sta palette_offset + sta 
palette_delay + sta chroma_offset + sta chroma_delay + jsr update_palette + ; install the vblank handler lda #7 ; deferred ldx #.hibyte(vblank_handler) @@ -753,6 +1797,30 @@ copy_byte_loop: jsr SETVBV main_loop: + ; count_frames = 0; count_iters = 0 + lda #0 + sta count_frames + sta count_iters + sta count_iters + 1 + + ; total_sec = 0.0; total_iters = 0.0 + jsr ZFR0 + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) + jsr FST0R + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) + jsr FST0R + + jsr clear_screen + jsr status_bar + jsr display_coords + + lda #0 + sta fill_level + +fill_loop: + ; sy = -92 .. 91 lda #(256-half_height) sta sy @@ -767,45 +1835,81 @@ loop_sy: sta sx + 1 loop_sx: - zoom_factor cx, sx, zoom, aspect_x - zoom_factor cy, sy, zoom, aspect_y + ; check the fill mask + ldy #0 + +loop_skip_level: + cpy fill_level + beq current_level + + lda fill_masks,y + and sx + bne not_skipped_mask1 + + lda fill_masks,y + and sy + beq skipped_mask + +not_skipped_mask1: + iny + jmp loop_skip_level + +current_level: + lda fill_masks,y + and sx + bne skipped_mask + + lda fill_masks,y + and sy + beq not_skipped_mask + +skipped_mask: + jmp skipped + +not_skipped_mask: + + ; run the fractal! + zoom_factor cx, sx, aspect_x + add32 cx, cx, ox + zoom_factor cy, sy, aspect_y + neg32 cy + add32 cy, cy, oy jsr mandelbrot jsr pset + jsr keycheck + beq no_key + ; @fixme clear the pixel stats + jmp main_loop +no_key: ; check if we should update the counters - ; - ; count_pixels >= width? update! - inc count_pixels - lda count_pixels - cmp #width - bmi update_status ; count_frames >= 120? update! 
lda count_frames cmp #120 ; >= 2 seconds - bmi skip_status + bpl update_status + jmp skipped update_status: - ; FR0 = (float)count_pixels & clear count_pixels - lda count_pixels - sta FR0 - lda #0 - sta FR0 + 1 - sta count_pixels + ; FR0 = (float)count_iters & clear count_iters + copy16 FR0, count_iters jsr IFP + lda #0 + sta count_iters + sta count_iters + 1 - ; FR1 = total_pixels - ldx #.lobyte(total_pixels) - ldy #.hibyte(total_pixels) + ; FR1 = total_iters + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) jsr FLD1R ; FR0 += FR1 jsr FADD - ; total_pixels = FR0 - ldx #.lobyte(total_pixels) - ldy #.hibyte(total_pixels) + ; total_iters = FR0 + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) jsr FST0R @@ -818,68 +1922,100 @@ update_status: sta count_frames jsr IFP - ; FR0 *= ms_per_frame - ldx #.lobyte(ms_per_frame) - ldy #.hibyte(ms_per_frame) + ; FR0 *= sec_per_frame + ldx #.lobyte(sec_per_frame) + ldy #.hibyte(sec_per_frame) jsr FLD1R jsr FMUL - ; FR0 += total_ms - ldx #total_ms - ldy #0 + ; FR0 += total_sec + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) jsr FLD1R jsr FADD - ; total_ms = FR0 - ldx #total_ms - ldy #0 + ; total_sec = FR0 + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) jsr FST0R - ; FR0 /= total_pixels - ldx #total_pixels - ldy #0 + ; FR0 /= total_iters + ldx #.lobyte(total_iters) + ldy #.hibyte(total_iters) jsr FLD1R jsr FDIV - ; convert to ASCII in INBUFF - jsr FASC - - ; find the last byte - ldy #0 -number_loop: - lda (INBUFF),y - bmi lastchar - - tax - lda char_map,x - sta textbuffer + speed_start,y - - iny - bpl number_loop -lastchar: - ; Y is last char - ; trim that high bit - and #$7f - tax - lda char_map,x - sta textbuffer + speed_start,y - - ; Fill out any remaining spaces - lda #0 -space_loop: - iny - sta textbuffer + speed_start,y - cpy #(20) - bmi space_loop - -skip_status: + ; FR0 *= us_per_sec + ldx #.lobyte(us_per_sec) + ldy #.hibyte(us_per_sec) + jsr FLD1R + jsr FMUL + ; round (down) to integer + jsr FPI clc - 
lda sx - adc #1 + jsr IFP + + lda #speed_start + sta text_col + lda #0 + sta text_row + draw_string_const str_speed + + lda text_col + pha + draw_string_const str_padding + pla + sta text_col + + ; convert to ASCII in INBUFF and print + jsr FASC + jsr draw_string + + ; elapsed time + ; FR0 = total_sec + ldx #.lobyte(total_sec) + ldy #.hibyte(total_sec) + jsr FLD0R + ; FR0 -> integer -> elapsed_work + jsr FPI + lda FR0 + sta elapsed_work + lda FR0 + 1 + sta elapsed_work + 1 + + draw_string_const str_space + + .macro do_countdown divisor, digits + ldx #.lobyte(divisor) + ldy #.hibyte(divisor) + lda #.lobyte(digits) + sta INBUFF + lda #.hibyte(digits) + sta INBUFF + 1 + jsr countdown + .endmacro + do_countdown 36000, digits_space + do_countdown 3600, digits_zero + draw_string_const str_h + do_countdown 600, digits_zero + do_countdown 60, digits_zero + draw_string_const str_m + do_countdown 10, digits_zero + do_countdown 1, digits_zero + draw_string_const str_s + +skipped: + + ; sx += fill_masks[fill_level] + 1 + ldx fill_level + lda fill_masks,x + clc + adc #1 ; will never carry + adc sx sta sx - lda sx + 1 - adc #0 + lda #0 + adc sx + 1 sta sx + 1 lda sx @@ -889,12 +2025,15 @@ skip_status: loop_sx_done: + ; sy += fill_masks[fill_level] + 1 + ldx fill_level + lda fill_masks,x clc - lda sy - adc #1 + adc #1 ; will never carry + adc sy sta sy - lda sy + 1 - adc #0 + lda #0 + adc sy + 1 sta sy + 1 lda sy @@ -904,9 +2043,139 @@ loop_sx_done: loop_sy_done: - draw_text 40 - str_done_len, str_done_len, str_done +fill_loop_done: + inc fill_level + lda fill_level + cmp #max_fill_level + beq loop + jmp fill_loop loop: ; finished - jmp loop + + lda #(40 - str_done_len) + sta text_col + lda #0 + sta text_row + draw_string_const str_done + + jsr keycheck + beq loop + jmp main_loop + +.endproc + +; digit string in INBUFF +; divisor X/Y +; clobbers temp, calls draw_string +.proc countdown + divisor = temp + stx divisor + sty divisor + 1 + + ; count whole multiples of divisor in elapsed_work + ldy #0 
+countdown_loop: + lda elapsed_work + 1 + cmp divisor + 1 + beq countdown_lobyte + bcc countdown_done + bcs countdown_inc +countdown_lobyte: + lda elapsed_work + cmp divisor + bcc countdown_done +countdown_inc: + sec + lda elapsed_work + sbc divisor + sta elapsed_work + lda elapsed_work + 1 + sbc divisor + 1 + sta elapsed_work + 1 + iny + jmp countdown_loop +countdown_done: + lda (INBUFF),y + eor #$80 + sta elapsed_digit + lda #.lobyte(elapsed_digit) + sta INBUFF + lda #.hibyte(elapsed_digit) + sta INBUFF + 1 + jsr draw_string + rts +.endproc + +.proc imul8xe_init + + bank_switch 0 + lda #0 + sta EXTENDED_RAM + bank_switch 1 + lda #1 + sta EXTENDED_RAM + bank_switch 0 + lda EXTENDED_RAM + beq init + + ; no bank switching available, we just overwrite the value in base ram + rts + +init: + + ; patch imul16_func into a forwarding thunk to imul16xe_func + lda #$4c ; 'jmp' opcode + sta imul16_func + lda #.lobyte(imul16xe_func) + sta imul16_func + 1 + sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 1 + lda #.hibyte(imul16xe_func) + sta imul16_func + 2 + sta mandelbrot::fixup_imul16_1 + imul16_patch_offset + 2 + + ; ditto for sqr16_func -> sqr16xe_func + lda #$4c ; 'jmp' opcode + sta sqr16_func + lda #.lobyte(sqr16xe_func) + sta sqr16_func + 1 + sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 1 + sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 1 + lda #.hibyte(sqr16xe_func) + sta sqr16_func + 2 + sta mandelbrot::fixup_sqr16_1 + sqr16_patch_offset + 2 + sta mandelbrot::fixup_sqr16_2 + sqr16_patch_offset + 2 + + + ; create the lookup table + ; go through the input set, in four 16KB chunks + + arg1 = FR1 + arg2 = FR2 + result = FR0 + + lda #$00 + sta arg1 + sta arg2 + sta ptr + lda #$40 + sta ptr + 1 + + ; $00 * $00 -> $3f * $ff + bank_switch 0 + jsr imul8xe_init_section + + ; $40 * $00 -> $7f * $ff + bank_switch 1 + jsr imul8xe_init_section + + ; $80 * $00 -> $bf * $ff + bank_switch 2 + jsr imul8xe_init_section + + ; $c0 * $00 -> $ff * $ff + bank_switch 3 + 
jsr imul8xe_init_section + + rts .endproc diff --git a/readme.md b/readme.md index 46ebd36..2c9efc1 100644 --- a/readme.md +++ b/readme.md @@ -14,30 +14,37 @@ Non-goals: Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals. --- brion, january 2023 +-- brooke, january 2023 - december 2024 ## Current state -Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet. +Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. -The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. +The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. -The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input. +* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition +* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops +* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications +* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication -The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. +The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates. Iterations are capped at 255. 
-## Next steps +The pixels are run in a progressive layout to get the basic shape on screen faster. -Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it! +There is a running counter of microseconds per iteration and elapsed time, using the vertical blank interrupts as a timer, used to track our progress. :D -Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. +There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. -I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. +There's some cute color cycling. ## Deps and build instructions I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that. Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices. + +## Todo + +See ideas in `todo.md`. 
diff --git a/tables.js b/tables.js new file mode 100644 index 0000000..50cbef9 --- /dev/null +++ b/tables.js @@ -0,0 +1,50 @@ +function db(func) { + let lines = []; + for (let i = 0; i < 256; i += 16) { + let items = []; + for (let j = 0; j < 16; j++) { + let x = i + j; + items.push(func(x)); + } + lines.push(' .byte ' + items.join(', ')); + } + return lines.join('\n'); +} + +let squares = []; +for (let i = 0; i < 512; i++) { + squares.push(Math.trunc((i * i + 1) / 2)); +} + +console.log( +`.segment "TABLES" + +.export mul_lobyte256 +.export mul_hibyte256 +.export mul_hibyte512 +.export sqr_lobyte +.export sqr_hibyte + +; (i * i + 1) / 2 for the multiplier +.align 256 +mul_lobyte256: +${db((i) => squares[i] & 0xff)} + +.align 256 +mul_hibyte256: +${db((i) => (squares[i] >> 8) & 0xff)} + +.align 256 +mul_hibyte512: +${db((i) => (squares[i + 256] >> 8) & 0xff)} + +; (i * i) for the plain squares +.align 256 +sqr_lobyte: +${db((i) => (i * i) & 0xff)} + +.align 256 +sqr_hibyte: +${db((i) => ((i * i) >> 8) & 0xff)} + +`); diff --git a/testme.js b/testme.js new file mode 100644 index 0000000..e12e706 --- /dev/null +++ b/testme.js @@ -0,0 +1,41 @@ +// ax = (a + x)^2/2 - a^2/2 - x^2/2 + +function half_square(x) { + return Math.round(x * x / 2) & 0xffff >>> 0; +} + +function mul8(a, b) { + let result = half_square(a + b) & 0xffff; + result = (result - half_square(a)) & 0xffff; + result = (result - half_square(b)) & 0xffff; + result = (result + (b & a & 1)) & 0xffff; + return result >>> 0; +} + +function mul16(a, b) { + let ah = (a & 0xff00) >>> 8; + let al = (a & 0x00ff) >>> 0; + let bh = (b & 0xff00) >>> 8; + let bl = (b & 0x00ff) >>> 0; + let result = (mul8(al, bl) & 0xffff) >>> 0; + result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0; + result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0; + result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0; + return result; +} + +let max = 65536; +//let max = 256; +//let max = 128; +//let max = 8; + +for (let a 
= 0; a < max; a++) { + for (let b = 0; b < max; b++) { + let expected = Math.imul(a, b) >>> 0; + //let actual = mul8(a, b); + let actual = mul16(a, b); + if (expected !== actual) { + console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`); + } + } +} \ No newline at end of file diff --git a/todo.md b/todo.md new file mode 100644 index 0000000..6807ae2 --- /dev/null +++ b/todo.md @@ -0,0 +1,17 @@ +things to try: + +* fix status bar to show elapsed time, per-iter time, per-pixel iter count + +* 'turbo' mode disabling graphics in full or part + +* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D + +* maybe clean up the load/layout of the big mul table + +* consider alternate lookup tables in the top 16KB under ROM + +* y-axis mirror optimization + +* extract viewport for display & re-input via keyboard + +* fujinet screenshot/viewport uploader