diff --git a/imul8xe.s b/imul8xe.s deleted file mode 100644 index 855e044..0000000 --- a/imul8xe.s +++ /dev/null @@ -1,175 +0,0 @@ -FR0 = $d4 ; float48 -PORTB = $d301 - - -EXTENDED_RAM = $4000 ; 16KiB bank on the XE - -; lookup table for top byte -> PORTB value for bank-switch -.align 256 -bankswitch: - .repeat 256, i - .byte ((i & $c0) >> 5) | $c1 - .endrepeat - -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch,x ; 4 cyc - sta PORTB ; 4 cyc - - - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc - - ; note: we are not restoring memory to save 6 cycles! 
- ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled - - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc - - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc - -done: -.endmacro - -.macro bank_switch bank - lda #((bank << 1) | $c1) - sta PORTB -.endmacro - -proc imul8xe_init - - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -endproc - -; Initialize a 16 KB chunk of the table -; input: multipliers in temp -; output: new multipliers in temp -; clobbers: temp, temp2 -proc imul8xe_init_section - arg1 = FR1 - arg2 = FR2 - result = FR0 - ptr = temp2 - - lda #$00 - sta ptr - lda #$40 - sta ptr + 1 - - ldx #0 - ldy #0 - - ; outer loop: $00 -> $3f -outer_loop: - - ; reset result to 0 - lda #0 - sta result - sta result + 1 - - ; inner loop: $00 -> $ff -inner_loop: - - ; copy result to data set - lda result - sta (ptr),y - lda result + 1 - sta (ptr),y - - ; result += 2 * arg2 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result - - ; inner loop check - inc arg1 - inc arg1 - inc ptr - inc ptr - bne inner_loop - - ; outer loop check - inc arg2 - inc ptr + 1 - lda ptr + 1 - cmp #$40 - bne outer_loop - - rts - -endproc diff --git a/mandel.s b/mandel.s index d198989..3622995 
100644 --- a/mandel.s +++ b/mandel.s @@ -74,9 +74,6 @@ width = 160 half_width = width >> 1 stride = width >> 2 -EXTENDED_RAM = $4000 ; 16KiB bank on the XE -PORTB = $D301 ; memory & bank-switch for XL/XE - DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 @@ -347,6 +344,14 @@ fill_masks: neg 4, arg .endmacro +; copies 16-bit arg1/arg2 into FR0/FR1, calls imul16_func, copies the 32-bit result from FR2 to dest; 518 - 828 cyc +.macro imul16 dest, arg1, arg2 + copy16 FR0, arg1 ; 12 cyc + copy16 FR1, arg2 ; 12 cyc + jsr imul16_func ; 470-780 cyc + copy32 dest, FR2 ; 24 cyc +.endmacro + .macro shift_round_16 arg, shift .repeat shift shl32 arg @@ -357,7 +362,7 @@ fill_masks: .macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc - jsr imul16_func ; ? cyc + jsr imul16_func ; 470-780 cyc shift_round_16 FR2, shift copy16 dest, FR2 + 2 ; 12 cyc .endmacro @@ -367,259 +372,54 @@ fill_masks: .local under256 .local next .local small_product - ; circa 92 cycles? this doesn't seem right - ; 81-92 cycles .scope mul_factor_a = arg1 mul_factor_x = arg2 mul_product_lo = dest mul_product_hi = dest + 1 - lda mul_factor_a ; 3 cyc + lda mul_factor_a ; setup: 3 cycles (lda only; the ldx below is commented out) + ;ldx mul_factor_x - ; (a + x)^2/2 - clc ; 2 cyc - adc mul_factor_x ; 3 cyc - tax ; 2 cyc - bcc under256 ; 2 cyc - lda mul_hibyte512,x ; 4 cyc - bcs next ; 2 cyc + clc ; (a + x)^2/2: 23 cycles + adc mul_factor_x + tax + bcc under256 + lda mul_hibyte512,x + bcs next under256: - lda mul_hibyte256,x ; 4 cyc - sec ; 2 cyc + lda mul_hibyte256,x + sec next: - sta mul_product_hi ; 3 cyc - lda mul_lobyte256,x ; 4 cyc + sta mul_product_hi + lda mul_lobyte256,x - ; - a^2/2 - ldx mul_factor_a ; 3 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc + ldx mul_factor_a ; - a^2/2: 20 cycles + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi - ; + x & a & 1: - ; (this is a kludge to correct a - ; roundoff error that makes odd * odd too low) - ldx 
mul_factor_x ; 3 cyc - txa ; 2 cyc - and mul_factor_a ; 3 cyc - and #1 ; 2 cyc + ldx mul_factor_x ; + x & a & 1: 22 cycles + txa ; (this is a kludge to correct a + and mul_factor_a ; roundoff error that makes odd * odd too low) + and #1 - clc ; 2 cyc - adc mul_product_lo ; 3 cyc - bcc small_product ; 2 cyc - inc mul_product_hi ; 5 cyc - - ; - x^2/2 + clc + adc mul_product_lo + bcc small_product + inc mul_product_hi small_product: - sec ; 2 cyc - sbc mul_lobyte256,x ; 4 cyc - sta mul_product_lo ; 3 cyc - lda mul_product_hi ; 3 cyc - sbc mul_hibyte256,x ; 4 cyc - sta mul_product_hi ; 3 cyc + sec ; - x^2/2: 19 cycles (2+4+3+3+4+3) + sbc mul_lobyte256,x + sta mul_product_lo + lda mul_product_hi + sbc mul_hibyte256,x + sta mul_product_hi .endscope .endmacro -; lookup table for top byte -> PORTB value for bank-switch -;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes -bank_switch_table: - .repeat 256, i - .byte ((i & $c0) >> 4) | $e1 - .endrepeat - -.macro bank_switch bank - lda #((bank << 2) | $e1) - sta PORTB -.endmacro - - -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc - - - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc - - ; note: we are not restoring memory to save 6 cycles! 
- ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled - - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc - - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc - -done: -.endmacro - -.proc imul8xe_init - - bank_switch 0 - lda #0 - sta EXTENDED_RAM - bank_switch 1 - lda #1 - sta EXTENDED_RAM - bank_switch 0 - lda EXTENDED_RAM - beq init - - ; no bank switching available, we just overwrite the value in base ram - rts - -init: - - ; patch imul16_func into a forwarding thunk to imul16xe_func - lda #$4c ; 'jmp' opcode - sta imul16_func - lda #.lobyte(imul16xe_func) - sta imul16_func + 1 - lda #.hibyte(imul16xe_func) - sta imul16_func + 2 - - ; create the lookup table - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -.endproc - -; Initialize a 16 KB chunk of the table -; input: multipliers in temp -; output: new multipliers in temp -; clobbers: temp, temp2 -.proc imul8xe_init_section - arg1 = FR1 - arg2 = FR2 - result = FR0 - ptr = temp2 - - lda #$00 - sta ptr - lda #$40 - sta ptr + 1 - - ldy #0 - - ; outer loop: $00 -> $3f -outer_loop: - - ; reset result to 0 - lda #0 - sta result - sta result + 1 - - ; inner loop: $00 -> $ff -inner_loop: - - ; copy result to data set - lda result - sta (ptr),y - lda result + 1 - iny - sta (ptr),y - dey - - ; result += 2 * arg2 - clc - lda 
arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - - ; inner loop check - inc arg1 - inc arg1 - inc ptr - inc ptr - bne inner_loop - - ; outer loop check - inc arg2 - inc ptr + 1 - lda ptr + 1 - cmp #$80 - bne outer_loop - - rts - -.endproc - .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) @@ -661,47 +461,6 @@ arg2_pos: rts ; 6 cyc .endproc -.proc imul16xe_func - arg1 = FR0 ; 16-bit arg (clobbered) - arg2 = FR1 ; 16-bit arg (clobbered) - result = FR2 ; 32-bit result - inter = temp2 - - ; h1l1 * h2l2 - ; (h1*256 + l1) * (h2*256 + l2) - ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) - ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - - imul8xe result, arg1, arg2 - lda #0 - sta result + 2 - sta result + 3 - - imul8xe inter, arg1 + 1, arg2 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1, arg2 + 1 - add16 result + 1, result + 1, inter - add_carry result + 3 - - imul8xe inter, arg1 + 1, arg2 + 1 - add16 result + 2, result + 2, inter - - ; In case of negative inputs, adjust high word - ; https://stackoverflow.com/a/28827013 - lda arg1 + 1 - bpl arg1_pos - sub16 result + 2, result + 2, arg2 -arg1_pos: - lda arg2 + 1 - bpl arg2_pos - sub16 result + 2, result + 2, arg1 -arg2_pos: - - rts ; 6 cyc -.endproc - .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment @@ -1207,8 +966,6 @@ zero_byte_loop: .proc start - jsr imul8xe_init - ; ox = 0; oy = 0; zoom = 0 ; count_frames = 0; count_pixels = 0 lda #0