diff --git a/Makefile b/Makefile index bd14c7d..008bf8c 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ all : mandel.xex -mandel.xex : mandel.o tables.o atari-asm-xex.cfg - ld65 -C ./atari-asm-xex.cfg -o $@ mandel.o tables.o +mandel.xex : mandel.o tables.o + ld65 -C ./atari-asm-xex.cfg -o $@ $+ %.o : %.s ca65 -o $@ $< diff --git a/atari-asm-xex.cfg b/atari-asm-xex.cfg index 93b80f3..6e6498d 100644 --- a/atari-asm-xex.cfg +++ b/atari-asm-xex.cfg @@ -6,10 +6,7 @@ SYMBOLS { } MEMORY { ZP: file = "", define = yes, start = $0082, size = $007E; - MAIN: file = %O, define = yes, start = %S, size = $4000 - %S; - # Keep $4000-7fff clear for expanded RAM access window - TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000; - # Keep $a000-$bfff clear for BASIC cartridge + MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S; } FILES { %O: format = atari; @@ -24,5 +21,5 @@ SEGMENTS { RODATA: load = MAIN, type = ro optional = yes; DATA: load = MAIN, type = rw optional = yes; BSS: load = MAIN, type = bss, optional = yes, define = yes; - TABLES: load = TABLES, type = ro, optional = yes, align = 256; + TABLES: load = MAIN, type = ro, optional = yes, align = 256; } diff --git a/mandel.s b/mandel.s index 9f594e8..3579b0f 100644 --- a/mandel.s +++ b/mandel.s @@ -62,11 +62,11 @@ FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) FMOVE = $DDB6 ; MOVE FR0 TO FR1 ; High data -framebuffer_top = $a000 -textbuffer = $af00 -framebuffer_bottom = $b000 -display_list = $bf00 -framebuffer_end = $c000 +framebuffer_top = $8000 +textbuffer = $8f00 +framebuffer_bottom = $9000 +display_list = $9f00 +framebuffer_end = $a000 height = 184 half_height = height >> 1 @@ -107,14 +107,12 @@ KEY_RIGHT = $87 .struct float48 exponent .byte - mantissa .byte 5 + mantissa .byte 6 .endstruct .import mul_lobyte256 .import mul_hibyte256 .import mul_hibyte512 -.import sqr_lobyte -.import sqr_hibyte .data @@ -376,150 +374,137 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro -.macro sqr16_round dest, arg, shift - ;imul16_round dest, arg, arg, shift - copy16 FR0, arg ; 12 cyc - jsr sqr16_func ; ? cyc - shift_round_16 FR2, shift - copy16 dest, FR2 + 2 ; 12 cyc -.endmacro +; Adapted from https://everything2.com/title/Fast+6502+multiplication +.macro imul8 dest, arg1, arg2 + .local under256 + .local next + .local small_product + ; circa 92 cycles? this doesn't seem right + ; 81-92 cycles + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 -; clobbers a, x -.macro sqr8 dest, arg - ldx arg - lda sqr_lobyte,x - sta dest - lda sqr_hibyte,x - sta dest + 1 + lda mul_factor_a ; 3 cyc + + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc + under256: + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc + next: + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc + + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc + + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc mul_product_hi ; 5 cyc + + ; - x^2/2 + small_product: + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + .endscope .endmacro ; lookup table for top byte -> PORTB value for bank-switch ;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes bank_switch_table: .repeat 256, i - .byte ((i & $c0) >> 4) | $e3 + .byte ((i & $c0) >> 4) | $e1 .endrepeat .macro bank_switch bank - lda #((bank << 2) | $e3) + lda #((bank << 2) | $e1) sta PORTB .endmacro -.macro imul8 dest, arg1, arg2, xe - .if xe - ; using 64KB lookup table - ; 58-77 cycles - ; clobbers x, y, dest to dest + 3 - .scope - output = dest - ptr = dest + 2 ; scratch space assumed - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc +; 58-77 cycles +; clobbers x, y, dest to dest + 3 +.macro imul8xe dest, arg1, arg2 +.local done +.local output +.local ptr + + output = dest + ptr = dest + 2 ; scratch space assumed + + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + lda arg2 ; 3 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled + ; note: we are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc + ; add the second param one last time for the skipped bit + clc ; 2 cyc + lda arg2 ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc - done: - .endscope - .else - ; Using base 48k RAM compatibility mode - ; Small table of half squares - ; Adapted from https://everything2.com/title/Fast+6502+multiplication - ; 81-92 cycles - .scope - mul_factor_a = arg1 - mul_factor_x = arg2 - mul_product_lo = dest - mul_product_hi = dest + 1 - - lda mul_factor_a ; 3 cyc - - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc - next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc - - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc - - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 - small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - .endscope - .endif +done: .endmacro .proc imul8xe_init @@ -547,14 +532,6 @@ init: lda #.hibyte(imul16xe_func) sta imul16_func + 2 - ; ditto for sqr16_func -> sqr16xe_func - lda #$4c ; 'jmp' opcode - sta sqr16_func - lda #.lobyte(sqr16xe_func) - sta sqr16_func + 1 - lda #.hibyte(sqr16xe_func) - sta sqr16_func + 2 - ; create the lookup table ; go through the input set, in four 16KB chunks @@ -655,13 +632,7 @@ inner_loop: .endproc -.macro imul16_impl xe - .local arg1 - .local arg2 - .local result - .local inter - .local arg1_pos - .local arg2_pos +.proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result @@ -672,20 +643,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8 result, arg1, arg2, xe + imul8 result, arg1, arg2 lda #0 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2, xe + imul8 inter, arg1 + 1, arg2 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1, xe + imul8 inter, arg1, arg2 + 1 add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1, xe + imul8 inter, arg1 + 1, arg2 + 1 add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -700,59 +671,47 @@ arg1_pos: arg2_pos: rts ; 6 cyc -.endmacro - -.macro sqr16_impl xe - .scope - arg = FR0 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - ;inter = temp2 - inter = FR1 - - lda arg + 1 - bpl arg_pos - neg16 arg - arg_pos: - - ; hl * hl - ; (h*256 + l) * (h*256 + l) - ; h*256*(h*256 + l) + l*(h*256 + l) - ; h*h*256*256 + h*l*256 + h*l*256 + l*l - - sqr8 result, arg - ;imul8 inter, arg, arg, xe - lda #0 - sta result + 2 - sta result + 3 - - imul8 inter, arg + 1, arg, xe - add16 result + 1, result + 1, inter - add_carry result + 3 - add16 result + 1, result + 1, inter - add_carry result + 3 - - sqr8 inter, arg + 1 - ;imul8 inter, arg + 1, arg + 1, xe - add16 result + 2, result + 2, inter - - rts ; 6 cyc - .endscope -.endmacro - -.proc imul16_func - imul16_impl 0 .endproc .proc imul16xe_func - imul16_impl 1 -.endproc + arg1 = FR0 ; 16-bit arg (clobbered) + arg2 = FR1 ; 16-bit arg (clobbered) + result = FR2 ; 32-bit result + inter = temp2 -.proc sqr16_func - sqr16_impl 0 -.endproc + ; h1l1 * h2l2 + ; (h1*256 + l1) * (h2*256 + l2) + ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) + ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 -.proc sqr16xe_func - sqr16_impl 1 + imul8xe result, arg1, arg2 + lda #0 + sta result + 2 + sta result + 3 + + imul8xe inter, arg1 + 1, arg2 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8xe inter, arg1, arg2 + 1 + add16 result + 1, result + 1, inter + add_carry result + 3 + + imul8xe inter, arg1 + 1, arg2 + 1 + add16 result + 2, result + 2, inter + + ; In case of negative inputs, adjust high word + ; https://stackoverflow.com/a/28827013 + lda arg1 + 1 + bpl arg1_pos + sub16 result + 2, result + 2, arg2 +arg1_pos: + lda arg2 + 1 + bpl arg2_pos + sub16 result + 2, result + 2, arg1 +arg2_pos: + + rts ; 6 cyc .endproc .macro round16 arg @@ -866,10 +825,10 @@ keep_going: quick_exit zy, 2 ; zx_2 = zx * zx - sqr16_round zx_2, zx, 4 + imul16_round zx_2, zx, zx, 4 ; zy_2 = zy * zy - sqr16_round zy_2, zy, 4 + imul16_round zy_2, zy, zy, 4 ; zx_zy = zx * zy imul16_round zx_zy, zx, zy, 4 diff --git a/tables.js b/tables.js index 50cbef9..c772f81 100644 --- a/tables.js +++ b/tables.js @@ -22,10 +22,7 @@ console.log( .export mul_lobyte256 .export mul_hibyte256 .export mul_hibyte512 -.export sqr_lobyte -.export sqr_hibyte -; (i * i + 1) / 2 for the multiplier .align 256 mul_lobyte256: ${db((i) => squares[i] & 0xff)} @@ -38,13 +35,4 @@ ${db((i) => (squares[i] >> 8) & 0xff)} mul_hibyte512: ${db((i) => (squares[i + 256] >> 8) & 0xff)} -; (i * i) for the plain squares -.align 256 -sqr_lobyte: -${db((i) => (i * i) & 0xff)} - -.align 256 -sqr_hibyte: -${db((i) => ((i * i) >> 8) & 0xff)} - `);