diff --git a/imul8xe.s b/imul8xe.s index 855e044..15adf64 100644 --- a/imul8xe.s +++ b/imul8xe.s @@ -70,106 +70,6 @@ bankswitch: done: .endmacro -.macro bank_switch bank - lda #((bank << 1) | $c1) - sta PORTB -.endmacro - proc imul8xe_init - - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - rts endproc - -; Initialize a 16 KB chunk of the table -; input: multipliers in temp -; output: new multipliers in temp -; clobbers: temp, temp2 -proc imul8xe_init_section - arg1 = FR1 - arg2 = FR2 - result = FR0 - ptr = temp2 - - lda #$00 - sta ptr - lda #$40 - sta ptr + 1 - - ldx #0 - ldy #0 - - ; outer loop: $00 -> $3f -outer_loop: - - ; reset result to 0 - lda #0 - sta result - sta result + 1 - - ; inner loop: $00 -> $ff -inner_loop: - - ; copy result to data set - lda result - sta (ptr),y - lda result + 1 - sta (ptr),y - - ; result += 2 * arg2 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result - - ; inner loop check - inc arg1 - inc arg1 - inc ptr - inc ptr - bne inner_loop - - ; outer loop check - inc arg2 - inc ptr + 1 - lda ptr + 1 - cmp #$40 - bne outer_loop - - rts - -endproc diff --git a/mandel.s b/mandel.s index 8c6130b..e0a8570 100644 --- a/mandel.s +++ b/mandel.s @@ -74,9 +74,6 @@ width = 160 half_width = width >> 1 stride = width >> 2 -EXTENDED_RAM = $4000 ; 16KiB bank on the XE -PORTB = $D301 ; memory & bank-switch for XL/XE - DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 @@ -431,179 +428,6 @@ fill_masks: .endscope .endmacro -; lookup table for top byte -> PORTB value for bank-switch -;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes -bank_switch_table: - .repeat 256, i - .byte ((i & $c0) >> 4) | $d1 - .endrepeat - -.macro bank_switch bank - lda #((bank << 2) | $d1) - sta PORTB -.endmacro - - -; 58-77 cycles -; clobbers x, y, dest to dest + 3 -.macro imul8xe dest, arg1, arg2 -.local done -.local output -.local ptr - - output = dest - ptr = dest + 2 ; scratch space assumed - - ; bottom 14 bits except the LSB are the per-bank table index - ; add $4000 for the bank pointer - lda arg1 ; 3 cyc - and #$fe ; 2 cyc - sta ptr ; 3 cyc - lda arg2 ; 3 cyc - and #$3f ; 2 cyc - clc ; 2 cyc - adc #$40 ; 2 cyc - sta ptr + 1 ; 3 cyc - - ; top 2 bits are the table bank selector - ldx arg2 ; 3 cyc - lda bank_switch_table,x ; 4 cyc - sta PORTB ; 4 cyc - - - ; copy the entry into output - ldy #0 ; 2 cyc - lda (ptr),y ; 5 cyc - sta output ; 3 cyc - iny ; 2 cyc - lda (ptr),y ; 5 cyc - sta output+1 ; 3 cyc - - ; note: we are not restoring memory to save 6 cycles! - ; this means those 16kb have to be switched back to base RAM - ; if we need to use them anywhere else - ;;; restore memory - ;;lda #$81 ; 2 cyc - disabled - ;;sta PORTB ; 4 cyc - disabled - - ; check that 1 bit we skipped to fit into space - lda arg1 ; 3 cyc - and #1 ; 2 cyc - beq done ; 2 cyc - - ; add the second param one last time for the skipped bit - clc ; 2 cyc - lda arg2 ; 3 cyc - adc output ; 3 cyc - sta output ; 3 cyc - lda #0 ; 2 cyc - adc output+1 ; 3 cyc - sta output+1 ; 3 cyc - -done: -.endmacro - -.proc imul8xe_init - - ; go through the input set, in four 16KB chunks - - arg1 = FR1 - arg2 = FR2 - result = FR0 - - lda #$00 - sta arg1 - sta arg2 - - ; $00 * $00 -> $3f * $ff - bank_switch 0 - jsr imul8xe_init_section - - ; $40 * $00 -> $7f * $ff - bank_switch 1 - jsr imul8xe_init_section - - ; $80 * $00 -> $bf * $ff - bank_switch 2 - jsr imul8xe_init_section - - ; $c0 * $00 -> $ff * $ff - bank_switch 3 - jsr imul8xe_init_section - - rts -.endproc - -; Initialize a 16 KB chunk of the table -; input: multipliers in temp -; output: new multipliers in temp -; clobbers: temp, temp2 -.proc imul8xe_init_section - arg1 = FR1 - arg2 = FR2 - result = FR0 - ptr = temp2 - - lda #$00 - sta ptr - lda #$40 - sta ptr + 1 - - ldy #0 - - ; outer loop: $00 -> $3f -outer_loop: - - ; reset result to 0 - lda #0 - sta result - sta result + 1 - - ; inner loop: $00 -> $ff -inner_loop: - - ; copy result to data set - lda result - sta (ptr),y - lda result + 1 - iny - sta (ptr),y - dey - - ; result += 2 * arg2 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - clc - lda arg2 - adc result - sta result - lda #0 - adc result + 1 - sta result + 1 - - ; inner loop check - inc arg1 - inc arg1 - inc ptr - inc ptr - bne inner_loop - - ; outer loop check - inc arg2 - inc ptr + 1 - lda ptr + 1 - cmp #$80 - bne outer_loop - - rts - -.endproc - .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) @@ -615,20 +439,20 @@ inner_loop: ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2) ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 - imul8xe result, arg1, arg2 + imul8 result, arg1, arg2 lda #0 sta result + 2 sta result + 3 - imul8xe inter, arg1 + 1, arg2 + imul8 inter, arg1 + 1, arg2 add16 result + 1, result + 1, inter add_carry result + 3 - imul8xe inter, arg1, arg2 + 1 + imul8 inter, arg1, arg2 + 1 add16 result + 1, result + 1, inter add_carry result + 3 - imul8xe inter, arg1 + 1, arg2 + 1 + imul8 inter, arg1 + 1, arg2 + 1 add16 result + 2, result + 2, inter ; In case of negative inputs, adjust high word @@ -1150,8 +974,6 @@ zero_byte_loop: .proc start - jsr imul8xe_init - ; ox = 0; oy = 0; zoom = 0 ; count_frames = 0; count_pixels = 0 lda #0