diff --git a/imul8xe.s b/imul8xe.s deleted file mode 100644 index 15adf64..0000000 --- a/imul8xe.s +++ /dev/null @@ -1,75 +0,0 @@ -FR0 = $d4 ; float48 -PORTB = $d301 - - -EXTENDED_RAM = $4000 ; 16KiB bank on the XE - -; lookup table for top byte -> PORTB value for bank-switch -.align 256 -bankswitch: - .repeat 256, i - .byte ((i & $c0) >> 5) | $c1 - .endrepeat - -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch,x ; 4 cyc - sta PORTB ; 4 cyc - - - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc - - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled - - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc - - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc - -done: -.endmacro - -proc imul8xe_init - rts -endproc diff --git a/mandel.s b/mandel.s index e0a8570..3622995 100644 --- a/mandel.s +++ b/mandel.s @@ -372,59 +372,51 @@ fill_masks: .local under256 .local next .local small_product - ; circa 92 cycles? this doesn't seem right - ; 81-92 cycles .scope mul_factor_a = arg1 mul_factor_x = arg2 mul_product_lo = dest mul_product_hi = dest + 1 - lda mul_factor_a ; 3 cyc + lda mul_factor_a ; setup: 6 cycles + ;ldx mul_factor_x - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc + lda mul_hibyte256,x + sec next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc + sta mul_product_hi + lda mul_lobyte256,x - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc + sec ; - x^2/2: 25 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi .endscope .endmacro