diff --git a/.mailmap b/.mailmap deleted file mode 100644 index 3102e50..0000000 --- a/.mailmap +++ /dev/null @@ -1,2 +0,0 @@ -Brooke Vibber -Brooke Vibber diff --git a/Makefile b/Makefile index bd14c7d..008bf8c 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ all : mandel.xex -mandel.xex : mandel.o tables.o atari-asm-xex.cfg - ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o +mandel.xex : mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ $+ %.o : %.s ca65 -o $@ $< diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg deleted file mode 100644 index 93b80f3..0000000 --- a/atari-asm-xex.cfg +++ /dev/null @@ -1,28 +0,0 @@ -FEATURES { - STARTADDRESS: default = $2E00; -} -SYMBOLS { - __STARTADDRESS__: type = export, value = %S; -} -MEMORY { - ZP: file = "", define = yes, start = $0082, size = $007E; - MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; - # Keep $4000-7fff clear for expanded RAM access window - TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000; - # Keep $a000-$bfff clear for BASIC cartridge -} -FILES { - %O: format = atari; -} -FORMATS { - atari: runad = start; -} -SEGMENTS { - ZEROPAGE: load = ZP, type = zp, optional = yes; - EXTZP: load = ZP, type = zp, optional = yes; # to enable modules to be able to link to C and assembler programs - CODE: load = MAIN, type = rw, define = yes; - RODATA: load = MAIN, type = ro optional = yes; - DATA: load = MAIN, type = rw optional = yes; - BSS: load = MAIN, type = bss, optional = yes, define = yes; - TABLES: load = TABLES, type = ro, optional = yes, align = 256; -} diff --git a/mandel.s b/mandel.s index b8985b3..71bc6c2 100644 --- a/mandel.s +++ b/mandel.s @@ -1,45 +1,33 @@ ; Our zero-page vars -ox = $80 ; fixed8.24: center point x -oy = $84 ; fixed8.24: center point y -cx = $88 ; fixed8.24: c_x -cy = $8c ; fixed8.24: c_y +sx = $80 ; i16: screen pixel x +sy = $82 ; i16: screen pixel y +ox = $84 ; fixed4.12: center point x +oy = $86 ; fixed4.12: center point y +cx = $88 ; fixed4.12: c_x +cy = $8a ; fixed4.12: c_y +zx = $8c ; fixed4.12: z_x +zy = $8e ; fixed4.12: z_y -zx = $90 ; fixed8.24: z_x -zy = $94 ; fixed8.24: z_y -zx_2 = $98 ; fixed8.24: z_x^2 -zy_2 = $9c ; fixed8.24: z_y^2 +zx_2 = $90 ; fixed4.12: z_x^2 +zy_2 = $92 ; fixed4.12: z_y^2 +zx_zy = $94 ; fixed4.12: z_x * z_y +dist = $96 ; fixed4.12: z_x^2 + z_y^2 -zx_zy = $a0 ; fixed8.24: z_x * z_y -dist = $a4 ; fixed8.24: z_x^2 + z_y^2 -sx = $a8 ; i16: screen pixel x -sy = $aa ; i16: screen pixel y -z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not -z_buffer_start = $ad ; u8: index into z_buffer -z_buffer_end = $ae ; u8: index into z_buffer -iter = $af ; u8: iteration count +iter = $a0 ; u8: iteration count -ptr = $b0 ; u16 -pixel_ptr = $b2 ; u16 -zoom = $b4 ; u8: zoom shift level -fill_level = $b5 ; u8 -pixel_color = $b6 ; u8 -pixel_mask = $b7 ; u8 -pixel_shift = $b8 ; u8 -pixel_offset = $b9 ; u8 -palette_offset = $ba ; u8 -chroma_offset = $bb ; u8 -palette_ticks = $bc ; u8 -chroma_ticks = $bd ; u8 -count_frames = $be ; u8 -count_pixels = $bf ; u8 +zoom = $a1 ; u8: zoom shift level +count_frames = $a2 ; u8 +count_pixels = $a3 ; u8 +total_ms = $a4 ; float48 +total_pixels = $aa ; float48 -total_pixels = $c0 ; float48 -total_ms = $c6 ; float48 -temp = $cc ; u16 -temp2 = $ce ; u16 - -palette_delay = 23 -chroma_delay = 137 +temp = $b0 ; u16 +temp2 = $b2 ; u16 +pixel_ptr = $b4 ; u16 +pixel_color = $b6 ; u8 +pixel_mask = $b7 ; u8 +pixel_shift = $b8 ; u8 +pixel_offset = $b9 ; u8 ; FP registers in zero page @@ -51,9 +39,6 @@ CIX = $f2 ; u8 - index into INBUFF INBUFF = $f3 ; u16 - pointer to ascii FLPTR = $fc ; u16 - pointer to user buffer float48 -CH1 = $02f2 ; previous character read from keyboard -CH = $02fc ; current character read from keyboard - LBUFF = $0580 ; result buffer for FASC routine ; FP ROM routine vectors @@ -70,11 +55,11 @@ FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) FMOVE = $DDB6 ; MOVE FR0 TO FR1 ; High data -framebuffer_top = $a000 -textbuffer = $af00 -framebuffer_bottom = $b000 -display_list = $bf00 -framebuffer_end = $c000 +framebuffer_top = $8000 +textbuffer = $8f00 +framebuffer_bottom = $9000 +display_list = $9f00 +framebuffer_end = $a000 height = 184 half_height = height >> 1 @@ -82,57 +67,26 @@ width = 160 half_width = width >> 1 stride = width >> 2 -EXTENDED_RAM = $4000 ; 16KiB bank on the XE -PORTB = $D301 ; memory & bank-switch for XL/XE - DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 -WSYNC = $D40A ; OS shadow registers SDLSTL = $230 SDLSTH = $231 ; interrupt stuff -SYSVBV = $E45F XITVBV = $E462 SETVBV = $E45C -COLOR0 = $2C4 -COLOR1 = $2C5 -COLOR2 = $2C6 -COLOR3 = $2C7 -COLOR4 = $2C8 - -; Keycodes! -KEY_PLUS = $06 -KEY_MINUS = $0e -KEY_UP = $8e -KEY_DOWN = $8f -KEY_LEFT = $86 -KEY_RIGHT = $87 -KEY_1 = $1f -KEY_2 = $1e -KEY_3 = $1a -KEY_4 = 24 -KEY_5 = 29 -KEY_6 = 27 -KEY_7 = 51 -KEY_8 = 53 -KEY_9 = 48 -KEY_0 = 50 - .struct float48 exponent .byte - mantissa .byte 5 + mantissa .byte 6 .endstruct .import mul_lobyte256 .import mul_hibyte256 .import mul_hibyte512 -.import sqr_lobyte -.import sqr_hibyte .data @@ -141,7 +95,7 @@ str_self: .byte "MANDEL-6502" str_self_end: str_speed: - .byte " ms/px" + .byte "ms/px" str_speed_end: str_run: .byte " RUN" @@ -154,9 +108,8 @@ str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done -speed_precision = 6 -speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 +speed_start = str_self_len + 2 speed_len = 14 + str_speed_len @@ -173,9 +126,6 @@ char_map: .byte 96 + i .endrepeat -hex_chars: - .byte "0123456789abcdef" - aspect: ; aspect ratio! ; pixels at 320w are 5:6 (narrow) @@ -239,70 +189,10 @@ color_map: .byte 3 .endrepeat - -palette_start: - .byte $0e - .byte $08 - .byte $04 -palette_repeat: - .byte $0e - .byte $08 - -palette_entries = 3 - -palette_chroma: - .repeat 15, i - .byte (i + 1) << 4 - .endrepeat - .repeat 2, i - .byte (i + 1) << 4 - .endrepeat -palette_chroma_entries = 15 - .code -;z_buffer_len = 16 ; 10.863 ms/px -;z_buffer_len = 12 ; 10.619 ms/px -z_buffer_len = 8 ; 10.612 ms/px -;z_buffer_len = 4 ; 12.395 ms/px -z_buffer_mask = z_buffer_len - 1 -z_buffer: - ; the last N zx/zy values - .repeat z_buffer_len - .word 0 - .word 0 - .endrepeat - .export start -;max_fill_level = 6 -max_fill_level = 3 -fill_masks: -; .byte %00011111 -; .byte %00001111 -; .byte %00000111 - .byte %00000011 - .byte %00000001 - .byte %00000000 - -viewport_zoom: - .byte 1 - .byte 6 - .byte 8 - .byte 6 - -viewport_ox: - .dword $00000000 - .dword $ff110000 - .dword $ff110000 - .dword $fe400000 - -viewport_oy: - .dword $00000000 - .dword $ffb60000 - .dword $ffbe0000 - .dword $00000000 - ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 clc ; 2 cyc @@ -313,21 +203,18 @@ viewport_oy: .endrepeat .endmacro -; 20 cycles .macro add16 dest, arg1, arg2 add 2, dest, arg1, arg2 .endmacro -; 38 cycles .macro add32 dest, arg1, arg2 - add 4, dest, arg1, arg2 + add 4, dest, arg2, dest .endmacro -; 8 cycles .macro add_carry dest - lda dest ; 3 cyc - adc #0 ; 2 cyc - sta dest ; 3 cyc + lda dest + adc #0 + sta dest .endmacro ; 2 + 9 * byte cycles @@ -340,35 +227,29 @@ viewport_oy: .endrepeat .endmacro -; 20 cycles .macro sub16 dest, arg1, arg2 sub 2, dest, arg1, arg2 .endmacro -; 38 cycles .macro sub32 dest, arg1, arg2 sub 4, dest, arg1, arg2 .endmacro -; 3 + 5 * bytes cycles .macro shl bytes, arg - asl arg ; 3 cyc + asl arg .repeat bytes-1, i - rol arg + 1 + i ; 5 cyc + rol arg + 1 + i .endrepeat .endmacro -; 13 cycles .macro shl16 arg shl 2, arg .endmacro -; 18 cycles .macro shl24 arg shl 3, arg .endmacro -; 23 cycles .macro shl32 arg shl 4, arg .endmacro @@ -381,17 +262,14 @@ viewport_oy: .endrepeat .endmacro -; 12 cycles .macro copy16 dest, arg copy 2, dest, arg .endmacro -; 24 cycles .macro copy32 dest, arg copy 4, dest, arg .endmacro -; 36 cycles .macro copyfloat dest, arg copy 6, dest, arg .endmacro @@ -416,419 +294,218 @@ viewport_oy: neg 4, arg .endmacro -; 11-27 + 23 * shift cycles -; 103-119 cycles for shift=4 -.macro shift_round_16 arg, shift - .repeat shift - shl32 arg ; 23 cycles - .endrepeat - round16 arg ; 11-27 cycles +; inner loop for imul16 +; bitnum < 8: 25 or 41 cycles +; bitnum >= 8: 30 or 46 cycles +.macro bitmul16 arg1, arg2, result, bitnum + .local zero + .local one + .local next + + ; does 16-bit adds + ; arg1 and arg2 are treated as unsigned + ; negative signed inputs must be flipped first + + ; 7 cycles up to the branch + + ; check if arg1 has 0 or 1 bit in this place + ; 5 cycles either way + .if bitnum < 8 + lda arg1 ; 3 cyc + and #(1 << (bitnum)) ; 2 cyc + .else + lda arg1 + 1 ; 3 cyc + and #(1 << ((bitnum) - 8)) ; 2 cyc + .endif + bne one ; 2 cyc + +zero: ; 18 cyc, 23 cyc + lsr result + 3 ; 5 cyc + jmp next ; 3 cyc + +one: ; 32 cyc, 37 cyc + ; 16-bit add on the top bits + clc ; 2 cyc + lda result + 2 ; 3 cyc + adc arg2 ; 3 cyc + sta result + 2 ; 3 cyc + lda result + 3 ; 3 cyc + adc arg2 + 1 ; 3 cyc + ror a ; 2 cyc - get a jump on the shift + sta result + 3 ; 3 cyc +next: + ror result + 2 ; 5 cyc + ror result + 1 ; 5 cyc + .if bitnum >= 8 + ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte + ; when it's all uninitialized data + ror result ; 5 cyc + .endif + .endmacro -; input: arg1, arg2 as fixed4.12 -; output: dest as fixed8.24 +; 5 to 25 cycles +.macro check_sign arg + ; Check sign bit and flip argument to postive, + ; keeping a count of sign bits in the Y register. + .local positive + lda arg + 1 ; 3 cyc + bpl positive ; 2 cyc + neg16 arg ; 18 cyc + iny ; 2 cyc +positive: +.endmacro + +; 518 - 828 cyc .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; ? cyc + jsr imul16_func ; 470-780 cyc copy32 dest, FR2 ; 24 cyc .endmacro -; input: arg as fixed4.12 -; output: dest as fixed8.24 -.macro sqr16 dest, arg - copy16 FR0, arg ; 12 cyc - jsr sqr16_func ; ? cyc - copy32 dest, FR2 ; 24 cyc +.macro shift_round_16 arg, shift + .repeat shift + shl32 arg + .endrepeat + round16 arg .endmacro -; input: arg as u8 -; output: dest as u16 -; clobbers a, x -.macro sqr8 dest, arg - ldx arg - lda sqr_lobyte,x - sta dest - lda sqr_hibyte,x - sta dest + 1 +.macro imul16_round dest, arg1, arg2, shift + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; 470-780 cyc + shift_round_16 FR2, shift + copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; input: arg as u8 -; input/output: dest as u16 -; clobbers a, x -.macro sqr8_add16 dest, arg - ldx arg - clc - lda sqr_lobyte,x - adc dest - sta dest - lda sqr_hibyte,x - adc dest + 1 - sta dest + 1 -.endmacro +; min 470 cycles +; max 780 cycles +.proc imul16_func_orig + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result -.segment "TABLES" -; lookup table for top byte -> PORTB value for bank-switch -.align 256 -bank_switch_table: - .repeat 256, i - .byte ((i & $c0) >> 4) | $e3 + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y + check_sign arg1 ; 5 to 25 cyc + check_sign arg2 ; 5 to 25 cyc + + ; zero out the 32-bit temp's top 16 bits + lda #0 ; 2 cyc + sta result + 2 ; 3 cyc + sta result + 3 ; 3 cyc + ; the bottom two bytes will get cleared by the shifts + + ; unrolled loop for maximum speed, at the cost + ; of a larger routine + ; 440 to 696 cycles + .repeat 16, bitnum + ; bitnum < 8: 25 or 41 cycles + ; bitnum >= 8: 30 or 46 cycles + bitmul16 arg1, arg2, result, bitnum .endrepeat -.code + ; In case of mixed input signs, return a negative result. + cpy #1 ; 2 cyc + bne positive_result ; 2 cyc + neg32 result ; 34 cyc +positive_result: -.macro bank_switch bank - lda #((bank << 2) | $e3) - sta PORTB -.endmacro - -.macro imul8 dest, arg1, arg2, xe - .if xe - ; using 64KB lookup table - ; 51-70 cycles - ; clobbers x, y, dest, ptr - .scope - output = dest - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - txa ; 2 cyc - and #$3f ; 2 cyc - ora #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; copy the entry into output - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - tay ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc - - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled - - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc - - ; add arg2 one last time for the skipped bit - clc ; 2 cyc - txa ; 2 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc - - done: - .endscope - .else - ; Using base 48k RAM compatibility mode - ; Small table of half squares - ; Adapted from https://everything2.com/title/Fast+6502+multiplication - ; 81-92 cycles - .scope - mul_factor_a = arg1 - mul_factor_x = arg2 - mul_product_lo = dest - mul_product_hi = dest + 1 - - lda mul_factor_a ; 3 cyc - - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc - next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc - - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc - - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 - small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - .endscope - .endif -.endmacro - -.proc imul8xe_init - - bank_switch 0 - lda #0 - sta EXTENDED_RAM - bank_switch 1 - lda #1 - sta EXTENDED_RAM - bank_switch 0 - lda EXTENDED_RAM - beq init - - ; no bank switching available, we just overwrite the value in base ram - rts - -init: - - ; patch imul16_func into a forwarding thunk to imul16xe_func - lda #$4c ; 'jmp' opcode - sta imul16_func - lda #.lobyte(imul16xe_func) - sta imul16_func + 1 - lda #.hibyte(imul16xe_func) - sta imul16_func + 2 - - ; ditto for sqr16_func -> sqr16xe_func - lda #$4c ; 'jmp' opcode - sta sqr16_func - lda #.lobyte(sqr16xe_func) - sta sqr16_func + 1 - lda #.hibyte(sqr16xe_func) - sta sqr16_func + 2 - - ; create the lookup table - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - sta ptr - lda #$40 - sta ptr + 1 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts + rts ; 6 cyc .endproc -; Initialize a 16 KB chunk of the table -; input: multipliers in temp -; output: new multipliers in temp -; clobbers: temp, temp2 -.proc imul8xe_init_section - arg1 = FR1 - arg2 = FR2 - result = FR0 - ptr = temp2 +; Adapted from https://everything2.com/title/Fast+6502+multiplication +.macro imul8 dest, arg1, arg2 + .local under256 + .local next + .local small_product + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 - lda #$00 - sta ptr - lda #$40 - sta ptr + 1 + lda mul_factor_a ; setup: 6 cycles + ;ldx mul_factor_x - ldy #0 + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next + under256: + lda mul_hibyte256,x + sec + next: + sta mul_product_hi + lda mul_lobyte256,x - ; outer loop: $00 -> $3f -outer_loop: + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi - ; reset result to 0 - lda #0 - sta result - sta result + 1 + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 - ; inner loop: $00 -> $ff -inner_loop: + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi + small_product: + sec ; - x^2/2: 25 cycles + sbc mul_lobyte256,x + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi + .endscope +.endmacro - ; copy result to data set - lda result - sta (ptr),y - lda result + 1 - iny - sta (ptr),y - dey - - ; result += 2 * arg2 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - - ; inner loop check - inc arg1 - inc arg1 - inc ptr - inc ptr - bne inner_loop - - ; outer loop check - inc arg2 - inc ptr + 1 - lda ptr + 1 - cmp #$80 - bne outer_loop - - rts - -.endproc - -.macro imul16_impl xe - .local arg1 - .local arg2 - .local result - .local inter - .local arg1_pos - .local arg2_pos +.proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result inter = temp2 - ; h1l1 * h2l2 - ; (h1*256 + l1) * (h2*256 + l2) - ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) - ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 + ldy #0 ; 2 cyc + ; counts the number of sign bits in Y + check_sign arg1 ; 5 to 25 cyc + check_sign arg2 ; 5 to 25 cyc - imul8 result, arg1, arg2, xe lda #0 + sta result + 0 + sta result + 1 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2, xe + imul8 inter, arg1, arg2 + add16 result, result, inter + + imul8 inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + + imul8 inter, arg1, arg2 + 1 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8 inter, arg1 + 1, arg2 + 1, xe + imul8 inter, arg1 + 1, arg2 + 1 add16 result + 2, result + 2, inter - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg1 + 1 - bpl arg1_pos - sub16 result + 2, result + 2, arg2 -arg1_pos: - lda arg2 + 1 - bpl arg2_pos - sub16 result + 2, result + 2, arg1 -arg2_pos: + ; In case of mixed input signs, return a negative result. + cpy #1 ; 2 cyc + bne positive_result ; 2 cyc + neg32 result ; 34 cyc +positive_result: rts ; 6 cyc -.endmacro - -.macro sqr16_impl xe - .scope - arg = FR0 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - ;inter = temp2 - inter = FR1 - - lda arg + 1 - bpl arg_pos - neg16 arg - arg_pos: - - ; hl * hl - ; (h*256 + l) * (h*256 + l) - ; h*256*(h*256 + l) + l*(h*256 + l) - ; h*h*256*256 + h*l*256 + h*l*256 + l*l - - sqr8 result, arg - lda #0 - sta result + 2 - sta result + 3 - - imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - add16 result + 1, result + 1, inter - add_carry result + 3 - - sqr8_add16 result + 2, arg + 1 - - rts ; 6 cyc - .endscope -.endmacro - -.proc imul16_func - imul16_impl 0 .endproc -.proc imul16xe_func - imul16_impl 1 -.endproc - -.proc sqr16_func - sqr16_impl 0 -.endproc - -.proc sqr16xe_func - sqr16_impl 1 -.endproc - -; 11-27 cycles .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment @@ -841,28 +518,21 @@ arg2_pos: ; round down if negative ; < $8000: round down - ; $8000 17 - ; $8001 27 - ; $8100 21 - ; $7fff 11 - - lda arg + 1 ; 3 cyc - cmp #$80 ; 2 cyc - beq high_half ; 2 cyc - - bpl increment ; 2 cyc - - bmi next ; 2 cyc + lda arg + 1 + cmp #$80 + beq high_half + bpl increment + bmi next high_half: - lda arg ; 3 cyc - beq check_sign ; 2 cyc - - jmp increment ; 3 cyc + lda arg + beq check_sign + bpl increment + bmi next check_sign: - lda arg + 3 ; 3 cyc - bmi next ; 2 cyc + lda arg + 3 + bmi next increment: ; 5-10 cyc inc arg + 2 ; 5 cyc @@ -875,8 +545,8 @@ next: .proc mandelbrot ; input: - ; cx: position scaled to 8.24 fixed point - -128..+127.9 - ; cy: position scaled to 8.24 + ; cx: position scaled to 4.12 fixed point - -8..+7.9 + ; cy: position scaled to 4.12 ; ; output: ; iter: iteration count at escape or 0 @@ -888,54 +558,21 @@ next: ; zx_zy = 0 ; dist = 0 ; iter = 0 -; lda #00 -; ldx #(iter - zx + 1) -;initloop: -; sta zx - 1,x -; dex -; bne initloop -; sta z_buffer_start -; sta z_buffer_end - lda #00 - sta zx - sta zx + 1 - sta zx + 2 - sta zx + 3 - sta zy - sta zy + 1 - sta zy + 2 - sta zy + 3 - sta zx_2 - sta zx_2 + 1 - sta zx_2 + 2 - sta zx_2 + 3 - sta zy_2 - sta zy_2 + 1 - sta zy_2 + 2 - sta zy_2 + 3 - sta zx_zy - sta zx_zy + 1 - sta zx_zy + 2 - sta zx_zy + 3 - sta dist - sta dist + 1 - sta dist + 2 - sta dist + 3 - sta iter - sta z_buffer_start - sta z_buffer_end + ldx #(iter - zx + 1) +initloop: + sta zx - 1,x + dex + bne initloop loop: ; iter++ & max-iters break inc iter bne keep_going - jmp exit_path + rts keep_going: .macro quick_exit arg, max - ; arg: fixed8.24 - ; max: integer .local positive .local negative .local nope_out @@ -943,159 +580,68 @@ keep_going: .local all_done ; check sign bit - lda arg + 3 + lda arg + 1 bmi negative positive: - cmp #max + cmp #((max) << 4) bmi all_done ; 'less than' - jmp exit_path + rts negative: - cmp #(256 - max) + cmp #(256 - ((max) << 4)) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: - jmp exit_path - + rts + first_equal: - ; following bytes all 0 shows it's really 'equal' - lda arg + 2 - bne all_done - lda arg + 1 - bne all_done lda arg - bne all_done - jmp exit_path + beq nope_out ; 2nd byte 0 shows it's really 'equal' all_done: .endmacro - ; 8.24: (-128 .. 127.9) + ; 4.12: (-8 .. +7.9) ; zx = zx_2 - zy_2 + cx - sub32 zx, zx_2, zy_2 - add32 zx, zx, cx + sub16 zx, zx_2, zy_2 + add16 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy - add32 zy, zx_zy, zx_zy - add32 zy, zy, cy + add16 zy, zx_zy, zx_zy + add16 zy, zy, cy quick_exit zy, 2 - ; convert 8.24 -> 4.12: (-8 .. +7.9) - shift_round_16 zx, 4 - shift_round_16 zy, 4 - ; zx_2 = zx * zx - sqr16 zx_2, zx + 2 + imul16_round zx_2, zx, zx, 4 ; zy_2 = zy * zy - sqr16 zy_2, zy + 2 + imul16_round zy_2, zy, zy, 4 ; zx_zy = zx * zy - imul16 zx_zy, zx + 2, zy + 2 + imul16_round zx_zy, zx, zy, 4 ; dist = zx_2 + zy_2 - add32 dist, zx_2, zy_2 + add16 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer ; as an optimization vs running to max iters - lda z_buffer_active - beq skip_z_buffer - - ldx z_buffer_start - cpx z_buffer_end - beq z_nothing_to_read - -z_buffer_loop: - .macro z_compare arg - .local compare_no_match - lda z_buffer,x - inx - cmp arg - bne compare_no_match - iny - compare_no_match: - .endmacro - .macro z_advance - .local skip_reset_x - cpx #(z_buffer_len * 4) - bmi skip_reset_x - ldx #0 - skip_reset_x: - .endmacro - .macro z_store arg - lda arg - sta z_buffer,x - inx - .endmacro - - ; Compare the previously stored z values - ldy #0 - z_compare zx + 2 - z_compare zx + 3 - z_compare zy + 2 - z_compare zy + 3 - - cpy #4 - bne z_no_matches - jmp z_exit - -z_no_matches: - z_advance - - cpx z_buffer_end - bne z_buffer_loop - -z_nothing_to_read: - - ; Store and expand - z_store zx + 2 - z_store zx + 3 - z_store zy + 2 - z_store zy + 3 - z_advance - stx z_buffer_end - - ; Increment the start roller if necessary (limit size) - lda iter - cmp #(z_buffer_len * 4) - bmi skip_inc_start - lda z_buffer_start - clc - adc #4 - tax - z_advance - stx z_buffer_start -skip_inc_start: - -skip_z_buffer: - jmp loop -z_exit: - lda #0 - sta iter - -exit_path: - ldx #0 - lda iter - bne next - inx -next: - stx z_buffer_active +peace_out: rts .endproc -.macro scale_zoom dest - ; clobbers X, flags +.macro zoom_factor dest, src, zoom, aspect .local cont .local enough ; cx = (sx << (8 - zoom)) + copy16 dest, src ldx zoom cont: cpx #8 @@ -1104,19 +650,10 @@ cont: inx jmp cont enough: -.endmacro - -.macro zoom_factor dest, src, aspect - ; output: dest: fixed8.24 - ; input: src: fixed4.12 - ; aspect: fixed4.12 - ; clobbers A, X, flags, etc - copy16 dest, src - scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) - imul16 dest, dest, aspect + imul16_round dest, dest, aspect, 4 .endmacro .proc pset @@ -1209,25 +746,6 @@ shift_done: rts .endproc -.macro draw_text_indirect col, len, strptr - ; clobbers A, X - .local loop - .local done - ldx #0 -loop: - cpx #len - beq done - txa - tay - lda (strptr),y - tay - lda char_map,y - sta textbuffer + col,x - inx - jmp loop -done: -.endmacro - .macro draw_text col, len, cstr ; clobbers A, X .local loop @@ -1246,70 +764,9 @@ done: .proc vblank_handler inc count_frames - - inc chroma_ticks - lda chroma_ticks - cmp #(chroma_delay) - bne skip_chroma - - lda #0 - sta chroma_ticks - - inc chroma_offset - lda chroma_offset - cmp #(palette_chroma_entries) - bne skip_chroma - - lda #0 - sta chroma_offset -skip_chroma: - - inc palette_ticks - lda palette_ticks - cmp #(palette_delay) - bne skip_luma - - lda #0 - sta palette_ticks - - inc palette_offset - lda palette_offset - cmp #(palette_entries) - bne skip_luma - - lda #0 - sta palette_offset - -skip_luma: - jsr update_palette jmp XITVBV .endproc -.proc update_palette - lda #0 - sta COLOR4 - - ldx chroma_offset - ldy palette_offset - lda palette_chroma,x - ora palette_start,y - sta COLOR2 - - ;inx - iny - lda palette_chroma,x - ora palette_start,y - sta COLOR1 - - ;inx - iny - lda palette_chroma,x - ora palette_start,y - sta COLOR0 - - rts -.endproc - .proc update_speed ; convert frames (u16) to fp ; add to frames_total @@ -1320,110 +777,33 @@ skip_luma: ; draw text .endproc -.proc keycheck - ; clobbers all - ; returns 255 in A if state change or 0 if no change +.proc start - ; check keyboard buffer - lda CH - cmp #$ff - beq skip_char - - ; Clear the keyboard buffer and re-enable interrupts - ldx #$ff - stx CH - - tay - - lda zoom - cpy #KEY_PLUS - beq plus - cpy #KEY_MINUS - beq minus - - ; temp+temp2 = $00010000 << (8 - zoom) - lda #$00 - sta temp - sta temp + 1 - lda #$01 - sta temp + 2 - lda #$00 - sta temp + 3 - scale_zoom temp + 2 - - cpy #KEY_UP - beq up - cpy #KEY_DOWN - beq down - cpy #KEY_LEFT - beq left - cpy #KEY_RIGHT - beq right - jmp number_keys - -skip_char: + ; ox = 0; oy = 0; zoom = 0 + ; count_frames = 0; count_pixels = 0 lda #0 - rts + sta ox + sta ox + 1 + sta oy + sta oy + 1 + sta count_frames + sta count_pixels -plus: - lda zoom - cmp #8 - bpl skip_char - inc zoom - jmp done -minus: - lda zoom - cmp #1 - bmi skip_char - dec zoom - jmp done -up: - sub32 oy, oy, temp - jmp done -down: - add32 oy, oy, temp - jmp done -left: - sub32 ox, ox, temp - jmp done -right: - add32 ox, ox, temp - jmp done + ; total_ms = 0.0; total_pixels = 0.0 + ldx #total_ms + jsr ZF1 + ldx #total_pixels + jsr ZF1 -number_keys: - cpy #KEY_1 - beq one - cpy #KEY_2 - beq two - cpy #KEY_3 - beq three - cpy #KEY_4 - beq four - jmp skip_char + ; zoom = 2x + lda #1 + sta zoom -one: - ldx #0 - jmp load_key_viewport -two: - ldx #1 - jmp load_key_viewport -three: - ldx #2 - jmp load_key_viewport -four: - ldx #3 - ; fall through -load_key_viewport: - jsr load_viewport - ; fall through -done: - lda #255 - rts + ; Disable display DMA + lda #0 + sta DMACTL -.endproc - -.proc clear_screen - ; zero the range from framebuffer_top to display_list + ; zero the range from framebuffer_top to framebuffer_end lda #.lobyte(framebuffer_top) sta temp lda #.hibyte(framebuffer_top) @@ -1439,72 +819,9 @@ zero_byte_loop: inc temp + 1 lda temp + 1 - cmp #.hibyte(display_list) + cmp #.hibyte(framebuffer_end) bne zero_page_loop - rts -.endproc - -.proc status_bar - ; Status bar - draw_text 0, str_self_len, str_self - draw_text 40 - str_run_len, str_run_len, str_run - - rts -.endproc - -; input: viewport selector in x -; clobbers: a, x -.proc load_viewport - - lda viewport_zoom,x - sta zoom - - txa - asl a - asl a - - tax - lda viewport_ox,x - sta ox - lda viewport_oy,x - sta oy - - inx - lda viewport_ox,x - sta ox + 1 - lda viewport_oy,x - sta oy + 1 - - inx - lda viewport_ox,x - sta ox + 2 - lda viewport_oy,x - sta oy + 2 - - inx - lda viewport_ox,x - sta ox + 3 - lda viewport_oy,x - sta oy + 3 - - rts -.endproc - -.proc start - - jsr imul8xe_init - - ; initialize viewport - ldx #0 ; overview - jsr load_viewport - - ; Disable display DMA - lda #0 - sta DMACTL - - jsr clear_screen - ; Copy the display list into properly aligned memory ; Can't cross 1024-byte boundaries :D ldx #0 @@ -1523,18 +840,14 @@ copy_byte_loop: sta DLISTH ; actual register sta SDLSTH ; shadow register the OS will copy in + ; Status bar + draw_text 0, str_self_len, str_self + draw_text 40 - str_run_len, str_run_len, str_run + ; Re-enable display DMA lda #$22 sta DMACTL - ; Initialize the palette - lda #0 - sta palette_offset - sta palette_delay - sta chroma_offset - sta chroma_delay - jsr update_palette - ; install the vblank handler lda #7 ; deferred ldx #.hibyte(vblank_handler) @@ -1542,25 +855,6 @@ copy_byte_loop: jsr SETVBV main_loop: - ; count_frames = 0; count_pixels = 0 - lda #0 - sta count_frames - sta count_pixels - - ; total_ms = 0.0; total_pixels = 0.0 - ldx #total_ms - jsr ZF1 - ldx #total_pixels - jsr ZF1 - - jsr clear_screen - jsr status_bar - - lda #0 - sta fill_level - -fill_loop: - ; sy = -92 .. 91 lda #(256-half_height) sta sy @@ -1575,53 +869,12 @@ loop_sy: sta sx + 1 loop_sx: - ; check the fill mask - ldy #0 - -loop_skip_level: - cpy fill_level - beq current_level - - lda fill_masks,y - and sx - bne not_skipped_mask1 - - lda fill_masks,y - and sy - beq skipped_mask - -not_skipped_mask1: - iny - jmp loop_skip_level - -current_level: - lda fill_masks,y - and sx - bne skipped_mask - - lda fill_masks,y - and sy - beq not_skipped_mask - -skipped_mask: - jmp skipped - -not_skipped_mask: - - ; run the fractal! - zoom_factor cx, sx, aspect_x - add32 cx, cx, ox - zoom_factor cy, sy, aspect_y - add32 cy, cy, oy + zoom_factor cx, sx, zoom, aspect_x + zoom_factor cy, sy, zoom, aspect_y jsr mandelbrot jsr pset - jsr keycheck - beq no_key - ; @fixme clear the pixel stats - jmp main_loop -no_key: ; check if we should update the counters ; ; count_pixels >= width? update! @@ -1633,7 +886,7 @@ no_key: ; count_frames >= 120? update! lda count_frames cmp #120 ; >= 2 seconds - bmi skipped + bmi skip_status update_status: ; FR0 = (float)count_pixels & clear count_pixels @@ -1693,11 +946,35 @@ update_status: ; convert to ASCII in INBUFF jsr FASC - ; print the first 6 digits - draw_text_indirect speed_start, speed_precision, INBUFF - draw_text speed_start + speed_precision, str_speed_len, str_speed + ; find the last byte + ldy #0 +number_loop: + lda (INBUFF),y + bmi lastchar -skipped: + tax + lda char_map,x + sta textbuffer + speed_start,y + + iny + bpl number_loop +lastchar: + ; Y is last char + ; trim that high bit + and #$7f + tax + lda char_map,x + sta textbuffer + speed_start,y + + ; Fill out any remaining spaces + lda #0 +space_loop: + iny + sta textbuffer + speed_start,y + cpy #(20) + bmi space_loop + +skip_status: clc lda sx @@ -1729,18 +1006,9 @@ loop_sx_done: loop_sy_done: -fill_loop_done: - inc fill_level - lda fill_level - cmp #max_fill_level - beq loop - jmp fill_loop + draw_text 40 - str_done_len, str_done_len, str_done loop: ; finished - draw_text 40 - str_done_len, str_done_len, str_done - jsr keycheck - beq loop - jmp main_loop - + jmp loop .endproc diff --git a/readme.md b/readme.md index d60644c..46ebd36 100644 --- a/readme.md +++ b/readme.md @@ -14,37 +14,30 @@ Non-goals: Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals. --- brooke, january 2023 - december 2024 +-- brion, january 2023 ## Current state -Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys. +Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet. -The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. +The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. -* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition -* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops -* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications -* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication +The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input. -The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. +The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. Iterations are capped at 255. -The pixels are run in a progressive layout to get the basic shape on screen faster. +## Next steps -There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D +Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it! -There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. +Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint. -There's some cute color cycling. +I may be able to do a faster multiply using tables of squares for 8-bit component multiplication. ## Deps and build instructions I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that. Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices. - -## Todo - -See ideas in `todo.md`. diff --git a/tables.js b/tables.js index 50cbef9..5afc3c0 100644 --- a/tables.js +++ b/tables.js @@ -11,40 +11,23 @@ function db(func) { return lines.join('\n'); } -let squares = []; -for (let i = 0; i < 512; i++) { - squares.push(Math.trunc((i * i + 1) / 2)); -} - console.log( `.segment "TABLES" .export mul_lobyte256 .export mul_hibyte256 .export mul_hibyte512 -.export sqr_lobyte -.export sqr_hibyte -; (i * i + 1) / 2 for the multiplier .align 256 mul_lobyte256: -${db((i) => squares[i] & 0xff)} +${db((x) => Math.round(x * x / 2) & 0xff)} .align 256 mul_hibyte256: -${db((i) => (squares[i] >> 8) & 0xff)} +${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)} .align 256 mul_hibyte512: -${db((i) => (squares[i + 256] >> 8) & 0xff)} - -; (i * i) for the plain squares -.align 256 -sqr_lobyte: -${db((i) => (i * i) & 0xff)} - -.align 256 -sqr_hibyte: -${db((i) => ((i * i) >> 8) & 0xff)} +${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)} `); diff --git a/todo.md b/todo.md deleted file mode 100644 index 284d653..0000000 --- a/todo.md +++ /dev/null @@ -1,19 +0,0 @@ -things to try: - -* skip add on the top-byte multiply in sqr8/mul8 - * should save a few cycles, suggestion by jamey - -* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D - -* try 3.13 fixed point instead of 4.12 for more precision - * can we get away without the extra bit? - * since exit compare space would be 6.26 i think so - -* y-axis mirror optimization - -* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering - * maybe redo tiering to just 4x4, 2x2, 1x1? - -* extract viewport for display & re-input via keyboard - -* fujinet screenshot/viewport uploader