diff --git a/mandel.s b/mandel.s index cf52fdb..2e16b53 100644 --- a/mandel.s +++ b/mandel.s @@ -374,65 +374,13 @@ viewport_oy: copy16 dest, FR2 + 2 ; 12 cyc .endmacro -; Adapted from https://everything2.com/title/Fast+6502+multiplication -.macro imul8 dest, arg1, arg2 - .local under256 - .local next - .local small_product - ; circa 92 cycles? this doesn't seem right - ; 81-92 cycles - .scope - mul_factor_a = arg1 - mul_factor_x = arg2 - mul_product_lo = dest - mul_product_hi = dest + 1 - - lda mul_factor_a ; 3 cyc - - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc - under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc - next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc - - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc - - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 - small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc - .endscope +; clobbers a, x +.macro sqr8 dest, arg + ldx arg + lda sqr_lobyte,x + sta dest + lda sqr_hibyte,x + sta dest + 1 .endmacro ; lookup table for top byte -> PORTB value for bank-switch @@ -447,64 +395,121 @@ bank_switch_table: sta PORTB .endmacro +.macro imul8 dest, arg1, arg2, xe + .if xe + ; using 64KB lookup table + ; 58-77 cycles + ; clobbers x, y, dest to dest + 3 + .scope + output = dest + ptr = dest + 2 ; scratch space assumed -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc + ; bottom 14 bits except the LSB are the per-bank table index + ; add $4000 for the bank pointer + lda arg1 ; 3 cyc + and #$fe ; 2 cyc + sta ptr ; 3 cyc + lda arg2 ; 3 cyc + and #$3f ; 2 cyc + clc ; 2 cyc + adc #$40 ; 2 cyc + sta ptr + 1 ; 3 cyc + + ; top 2 bits are the table bank selector + ldx arg2 ; 3 cyc + lda bank_switch_table,x ; 4 cyc + sta PORTB ; 4 cyc - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc + ; copy the entry into output + ldy #0 ; 2 cyc + lda (ptr),y ; 5 cyc + sta output ; 3 cyc + iny ; 2 cyc + lda (ptr),y ; 5 cyc + sta output+1 ; 3 cyc - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled + ; note: we are not restoring memory to save 6 cycles! + ; this means those 16kb have to be switched back to base RAM + ; if we need to use them anywhere else + ;;; restore memory + ;;lda #$81 ; 2 cyc - disabled + ;;sta PORTB ; 4 cyc - disabled - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc + ; check that 1 bit we skipped to fit into space + lda arg1 ; 3 cyc + and #1 ; 2 cyc + beq done ; 2 cyc - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc + ; add the second param one last time for the skipped bit + clc ; 2 cyc + lda arg2 ; 3 cyc + adc output ; 3 cyc + sta output ; 3 cyc + lda #0 ; 2 cyc + adc output+1 ; 3 cyc + sta output+1 ; 3 cyc -done: + done: + .endscope + .else + ; Using base 48k RAM compatibility mode + ; Small table of half squares + ; Adapted from https://everything2.com/title/Fast+6502+multiplication + ; 81-92 cycles + .scope + mul_factor_a = arg1 + mul_factor_x = arg2 + mul_product_lo = dest + mul_product_hi = dest + 1 + + lda mul_factor_a ; 3 cyc + + ; (a + x)^2/2 + clc ; 2 cyc + adc mul_factor_x ; 3 cyc + tax ; 2 cyc + bcc under256 ; 2 cyc + lda mul_hibyte512,x ; 4 cyc + bcs next ; 2 cyc + under256: + lda mul_hibyte256,x ; 4 cyc + sec ; 2 cyc + next: + sta mul_product_hi ; 3 cyc + lda mul_lobyte256,x ; 4 cyc + + ; - a^2/2 + ldx mul_factor_a ; 3 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + + ; + x & a & 1: + ; (this is a kludge to correct a + ; roundoff error that makes odd * odd too low) + ldx mul_factor_x ; 3 cyc + txa ; 2 cyc + and mul_factor_a ; 3 cyc + and #1 ; 2 cyc + + clc ; 2 cyc + adc mul_product_lo ; 3 cyc + bcc small_product ; 2 cyc + inc mul_product_hi ; 5 cyc + + ; - x^2/2 + small_product: + sec ; 2 cyc + sbc mul_lobyte256,x ; 4 cyc + sta mul_product_lo ; 3 cyc + lda mul_product_hi ; 3 cyc + sbc mul_hibyte256,x ; 4 cyc + sta mul_product_hi ; 3 cyc + .endscope + .endif .endmacro .proc imul8xe_init @@ -632,7 +637,13 @@ inner_loop: .endproc -.proc imul16_func +.macro imul16_impl xe + .local arg1 + .local arg2 + .local result + .local inter + .local arg1_pos + .local arg2_pos arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result @@ -643,20 +654,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8 result, arg1, arg2 + imul8 result, arg1, arg2, xe lda #0 sta result + 2 sta result + 3 - imul8 inter, arg1 + 1, arg2 + imul8 inter, arg1 + 1, arg2, xe add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1, arg2 + 1 + imul8 inter, arg1, arg2 + 1, xe add16 result + 1, result + 1, inter add_carry result + 3 - imul8 inter, arg1 + 1, arg2 + 1 + imul8 inter, arg1 + 1, arg2 + 1, xe add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -671,47 +682,14 @@ arg1_pos: arg2_pos: rts ; 6 cyc +.endmacro + +.proc imul16_func + imul16_impl 0 .endproc .proc imul16xe_func - arg1 = FR0 ; 16-bit arg (clobbered) - arg2 = FR1 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 - - ; h1l1 * h2l2 - ; (h1*256 + l1) * (h2*256 + l2) - ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) - ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - - imul8xe result, arg1, arg2 - lda #0 - sta result + 2 - sta result + 3 - - imul8xe inter, arg1 + 1, arg2 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1, arg2 + 1 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1 + 1, arg2 + 1 - add16 result + 2, result + 2, inter - - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg1 + 1 - bpl arg1_pos - sub16 result + 2, result + 2, arg2 -arg1_pos: - lda arg2 + 1 - bpl arg2_pos - sub16 result + 2, result + 2, arg1 -arg2_pos: - - rts ; 6 cyc + imul16_impl 1 .endproc .macro round16 arg diff --git a/tables.js b/tables.js index c772f81..50cbef9 100644 --- a/tables.js +++ b/tables.js @@ -22,7 +22,10 @@ console.log( .export mul_lobyte256 .export mul_hibyte256 .export mul_hibyte512 +.export sqr_lobyte +.export sqr_hibyte +; (i * i + 1) / 2 for the multiplier .align 256 mul_lobyte256: ${db((i) => squares[i] & 0xff)} @@ -35,4 +38,13 @@ ${db((i) => (squares[i] >> 8) & 0xff)} mul_hibyte512: ${db((i) => (squares[i + 256] >> 8) & 0xff)} +; (i * i) for the plain squares +.align 256 +sqr_lobyte: +${db((i) => (i * i) & 0xff)} + +.align 256 +sqr_hibyte: +${db((i) => ((i * i) >> 8) & 0xff)} + `);