Compare commits

..

No commits in common. "main" and "xe" have entirely different histories.
main ... xe

4 changed files with 176 additions and 54 deletions

View file

@ -1,25 +0,0 @@
FEATURES {
STARTADDRESS: default = $2E00;
}
SYMBOLS {
__STARTADDRESS__: type = export, value = %S;
}
MEMORY {
ZP: file = "", define = yes, start = $0082, size = $007E;
MAIN: file = %O, define = yes, start = %S, size = $BC20 - %S;
}
FILES {
%O: format = atari;
}
FORMATS {
atari: runad = start;
}
SEGMENTS {
ZEROPAGE: load = ZP, type = zp, optional = yes;
EXTZP: load = ZP, type = zp, optional = yes; # to enable modules to be able to link to C and assembler programs
CODE: load = MAIN, type = rw, define = yes;
RODATA: load = MAIN, type = ro optional = yes;
DATA: load = MAIN, type = rw optional = yes;
BSS: load = MAIN, type = bss, optional = yes, define = yes;
TABLES: load = MAIN, type = ro, optional = yes, align = 256;
}

175
imul8xe.s Normal file
View file

@ -0,0 +1,175 @@
FR0 = $d4 ; float48
PORTB = $d301
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
; lookup table for top byte -> PORTB value for bank-switch
.align 256
bankswitch:
.repeat 256, i
.byte ((i & $c0) >> 5) | $c1
.endrepeat
; 58-77 cycles
; clobbers x, y, dest to dest + 3
.macro imul8xe dest, arg1, arg2
.local done
.local output
.local ptr
output = dest
ptr = dest + 2 ; scratch space assumed
; bottom 14 bits except the LSB are the per-bank table index
; add $4000 for the bank pointer
lda arg1 ; 3 cyc
and #$fe ; 2 cyc
sta ptr ; 3 cyc
lda arg2 ; 3 cyc
and #$3f ; 2 cyc
clc ; 2 cyc
adc #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; top 2 bits are the table bank selector
ldx arg2 ; 3 cyc
lda bank_switch,x ; 4 cyc
sta PORTB ; 4 cyc
; copy the entry into output
ldy #0 ; 2 cyc
lda (ptr),y ; 5 cyc
sta output ; 3 cyc
iny ; 2 cyc
lda (ptr),y ; 5 cyc
sta output+1 ; 3 cyc
; note: we are not restoring memory to save 6 cycles!
; this means those 16kb have to be switched back to base RAM
; if we need to use them anywhere else
;;; restore memory
;;lda #$81 ; 2 cyc - disabled
;;sta PORTB ; 4 cyc - disabled
; check that 1 bit we skipped to fit into space
lda arg1 ; 3 cyc
and #1 ; 2 cyc
beq done ; 2 cyc
; add the second param one last time for the skipped bit
clc ; 2 cyc
lda arg2 ; 3 cyc
adc output ; 3 cyc
sta output ; 3 cyc
lda #0 ; 2 cyc
adc output+1 ; 3 cyc
sta output+1 ; 3 cyc
done:
.endmacro
.macro bank_switch bank
lda #((bank << 1) | $c1)
sta PORTB
.endmacro
proc imul8xe_init
; go through the input set, in four 16KB chunks
arg1 = FR1
arg2 = FR2
result = FR0
lda #$00
sta arg1
sta arg2
; $00 * $00 -> $3f * $ff
bank_switch 0
jsr imul8xe_init_section
; $40 * $00 -> $7f * $ff
bank_switch 1
jsr imul8xe_init_section
; $80 * $00 -> $bf * $ff
bank_switch 2
jsr imul8xe_init_section
; $c0 * $00 -> $ff * $ff
bank_switch 3
jsr imul8xe_init_section
rts
endproc
; Initialize a 16 KB chunk of the table
; input: multipliers in temp
; output: new multipliers in temp
; clobbers: temp, temp2
proc imul8xe_init_section
arg1 = FR1
arg2 = FR2
result = FR0
ptr = temp2
lda #$00
sta ptr
lda #$40
sta ptr + 1
ldx #0
ldy #0
; outer loop: $00 -> $3f
outer_loop:
; reset result to 0
lda #0
sta result
sta result + 1
; inner loop: $00 -> $ff
inner_loop:
; copy result to data set
lda result
sta (ptr),y
lda result + 1
sta (ptr),y
; result += 2 * arg2
clc
lda arg2
adc result
sta result
lda #0
adc result + 1
sta result
lda arg2
adc result
sta result
lda #0
adc result + 1
sta result
; inner loop check
inc arg1
inc arg1
inc ptr
inc ptr
bne inner_loop
; outer loop check
inc arg2
inc ptr + 1
lda ptr + 1
cmp #$40
bne outer_loop
rts
endproc

View file

@ -1148,13 +1148,11 @@ skip_char:
rts
plus:
lda zoom
cmp #8
bpl skip_char
inc zoom
jmp done
minus:
lda zoom
cmp #1
bmi skip_char
dec zoom
@ -1211,7 +1209,7 @@ zero_byte_loop:
jsr imul8xe_init
; ox = 0; oy = 0
; ox = 0; oy = 0; zoom = 0
; count_frames = 0; count_pixels = 0
lda #0
sta ox

26
todo.md
View file

@ -1,26 +0,0 @@
things to try:
* fix the pan/zoom bug where it doesn't reset loop right :(
* add some preset viewports that can be switched via number keys (1, 2, 3 etc)
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* square-root special case of multiplication for zx*zx and zy*zy
* the hi1*hi2 and lo1*lo2 8-bit muls can be optimized into a 512-byte lookup table
* jamey on mastodon tried this but had some problems. see what happens on our version!
* double-check rounding behavior is correct
* try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra bit?
* y-axis mirror optimization
* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
* rework the palette cycling to look more like an advancing flow
* extact viewport for display & re-input via keyboard
* fujinet screenshot/viewport uploader