Brooke Vibber
83cba4afa3
Uses the "big multiplication table" in 64KB of extended memory if bank switching appears to work, otherwise uses the table of squares lookups. Initial view clocks in at 13.133 ms/px for the XE version and still 14.211 ms/px for the 400/800/XL version. Tested in emulator with 130XE and XL+Ultimate 1MB upgrade configs, and base implementation on the 800XL emulator.
1462 lines
27 KiB
ArmAsm
1462 lines
27 KiB
ArmAsm
; ---------------------------------------------------------------------------
; Zero-page variables owned by this program
; ---------------------------------------------------------------------------

; Screen position being rendered and the centre of the view
sx              = $80   ; i16: screen pixel x
sy              = $82   ; i16: screen pixel y
ox              = $84   ; fixed4.12: center point x
oy              = $86   ; fixed4.12: center point y

; Per-pixel iteration state
; (mandelbrot zeroes every byte from zx through iter in one loop)
cx              = $88   ; fixed4.12: c_x
cy              = $8a   ; fixed4.12: c_y
zx              = $8c   ; fixed4.12: z_x
zy              = $8e   ; fixed4.12: z_y
zx_2            = $90   ; fixed4.12: z_x^2
zy_2            = $92   ; fixed4.12: z_y^2
zx_zy           = $94   ; fixed4.12: z_x * z_y
dist            = $96   ; fixed4.12: z_x^2 + z_y^2
iter            = $a0   ; u8: iteration count

; View and speed statistics
zoom            = $a1   ; u8: zoom shift level
count_frames    = $a2   ; u8
count_pixels    = $a3   ; u8
total_ms        = $a4   ; float48
total_pixels    = $aa   ; float48

; Cycle-detection ("lake") buffer bookkeeping
z_buffer_active = $b0   ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start  = $b1   ; u8: index into z_buffer
z_buffer_end    = $b2   ; u8: index into z_buffer

; Scratch words and pixel-plotting state
temp            = $b4   ; u16
temp2           = $b6   ; u16
pixel_ptr       = $b8   ; u16
pixel_color     = $ba   ; u8
pixel_mask      = $bb   ; u8
pixel_shift     = $bc   ; u8
pixel_offset    = $bd   ; u8
fill_level      = $be   ; u8
palette_offset  = $bf   ; u8
|
|
|
|
; ---------------------------------------------------------------------------
; Floating-point package registers in zero page, plus OS keyboard locations
; ---------------------------------------------------------------------------
FR0     = $d4       ; float48
FRE     = $da
FR1     = $e0       ; float48
FR2     = $e6       ; float48
CIX     = $f2       ; u8 - index into INBUFF
INBUFF  = $f3       ; u16 - pointer to ascii
FLPTR   = $fc       ; u16 - pointer to user buffer float48

CH1     = $02f2     ; previous character read from keyboard
CH      = $02fc     ; current character read from keyboard

LBUFF   = $0580     ; result buffer for FASC routine
|
|
|
|
; ---------------------------------------------------------------------------
; Floating-point ROM entry points
; (XX = zero-page address passed in X, YYXX = 16-bit address in Y:X)
; ---------------------------------------------------------------------------
FASC    = $D8E6     ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
IFP     = $D9AA     ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
FSUB    = $DA60     ; SUBTRACTION (FR0 -= FR1)
FADD    = $DA66     ; ADDITION (FR0 += FR1)
FMUL    = $DADB     ; MULTIPLICATION (FR0 *= FR1)
FDIV    = $DB28     ; DIVISION (FR0 /= FR1)
ZF1     = $DA46     ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
FLD0R   = $DD89     ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
FLD1R   = $DD98     ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
FST0R   = $DDA7     ; STORE FR0 IN USER BUFFER (YYXX)
FMOVE   = $DDB6     ; MOVE FR0 TO FR1
|
|
|
|
; ---------------------------------------------------------------------------
; High memory layout: two framebuffer halves, with the text row and the
; display list tucked into the last page of each half
; ---------------------------------------------------------------------------
framebuffer_top     = $8000
textbuffer          = $8f00
framebuffer_bottom  = $9000
display_list        = $9f00
framebuffer_end     = $a000

; Bitmap geometry (2 bits per pixel, so 4 pixels per byte)
height      = 184
half_height = height / 2
width       = 160
half_width  = width / 2
stride      = width / 4     ; bytes per bitmap line
|
|
|
|
; ---------------------------------------------------------------------------
; Hardware registers and OS vectors
; ---------------------------------------------------------------------------

; Extended memory
EXTENDED_RAM = $4000    ; 16KiB bank on the XE
PORTB        = $D301    ; memory & bank-switch for XL/XE

; Display hardware registers
DMACTL  = $D400
DLISTL  = $D402
DLISTH  = $D403
WSYNC   = $D40A

; OS shadow registers
SDLSTL  = $230
SDLSTH  = $231

; interrupt stuff
SETVBV  = $E45C
SYSVBV  = $E45F
XITVBV  = $E462

; Color shadow registers (COLOR0..COLOR4 are consecutive)
COLOR0  = $2C4
COLOR1  = $2C5
COLOR2  = $2C6
COLOR3  = $2C7
COLOR4  = $2C8
|
|
|
|
; ---------------------------------------------------------------------------
; Keyboard codes as read from CH
; ---------------------------------------------------------------------------
KEY_PLUS    = $06       ; zoom in
KEY_MINUS   = $0e       ; zoom out
KEY_LEFT    = $86       ; pan (KEY_PLUS  with bit 7 set)
KEY_RIGHT   = $87
KEY_UP      = $8e       ; pan (KEY_MINUS with bit 7 set)
KEY_DOWN    = $8f
|
|
|
|
; Layout of a 48-bit (6-byte) floating-point value as used by the FP ROM:
; one exponent/sign byte followed by five bytes of packed BCD digits
; (compare ms_per_frame, which is stored as exactly that).
; The mantissa was previously declared as 6 bytes, which made the struct
; 7 bytes long and inconsistent with the 6-byte slots used everywhere else
; (copyfloat copies 6 bytes; total_ms and total_pixels are 6 bytes apart).
.struct float48
    exponent    .byte
    mantissa    .byte 5
.endstruct
|
|
|
|
.import mul_lobyte256
|
|
.import mul_hibyte256
|
|
.import mul_hibyte512
|
|
|
|
.data

; ---------------------------------------------------------------------------
; Status-bar text. Each string is bracketed by a start and an end label so
; its length can be computed at assembly time.
; ---------------------------------------------------------------------------
strings:

str_self:
    .byte "MANDEL-6502"
str_self_end:

str_speed:
    .byte " ms/px"
str_speed_end:

str_run:
    .byte " RUN"
str_run_end:

str_done:
    .byte "DONE"
str_done_end:

str_self_len    = str_self_end  - str_self
str_speed_len   = str_speed_end - str_speed
str_run_len     = str_run_end   - str_run
str_done_len    = str_done_end  - str_done

; Number of characters of the formatted speed value that get displayed
speed_precision = 6

; Column where the speed read-out starts: right-aligned so that the digits,
; the unit suffix and one spare column all fit to the left of the
; right-hand status word
speed_start     = 40 - str_done_len - str_speed_len - speed_precision - 1
speed_len       = 14 + str_speed_len
|
|
|
|
|
|
; Map ATASCII string values (0..127) to framebuffer font entries.
;   $00-$1f -> $40-$5f
;   $20-$5f -> $00-$3f
;   $60-$7f -> $60-$7f (unchanged)
char_map:
    .repeat 128, c
        .if c < 32
            .byte c + 64
        .elseif c < 96
            .byte c - 32
        .else
            .byte c
        .endif
    .endrepeat
|
|
|
|
; Digit characters for hexadecimal output, indexed by nibble value (0..15)
hex_chars:
    .byte "0123456789abcdef"
|
|
|
|
; ---------------------------------------------------------------------------
; Aspect-ratio correction factors, stored as fixed4.12 words
; (1.0 = 1 << 12, so n/4 = n << (12 - 2)).
;
; Pixels at 320w are 5:6 (narrow); pixels at 160w are 5:3 (wide).
;
;   cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
;   cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
;
; so the vertical range   -92 .. 91.9 maps to -2.15625 .. 2.15624
; and the horizontal range -80 .. 79.9 maps to -3.125   .. 3.124
;
; 184h is the equivalent of 220.8h at square pixels;
; 320 / 220.8 = 1.45 display aspect ratio.
; ---------------------------------------------------------------------------
aspect:

aspect_x:                       ; fixed4.12 5/4
    .word (5 << 12) / 4

aspect_y:                       ; fixed4.12 3/4
    .word (3 << 12) / 4
|
|
|
|
; Milliseconds per video frame as a float48 constant: 16.66666667
; Byte 0 is the exponent/sign, the remaining five bytes are packed BCD digits.
ms_per_frame:
    .byte 64                        ; exponent/sign
    .byte $16, $66, $66, $66, $67   ; BCD digits
|
|
|
|
; ---------------------------------------------------------------------------
; Display list template. start copies these bytes to display_list before
; pointing the hardware at them.
; ---------------------------------------------------------------------------
display_list_start:
    ; 24 lines overscan
    .repeat 3
        .byte $70                   ; 8 blank lines
    .endrepeat

    ; 8 scan lines, 1 row of 40-column text, fetched from textbuffer
    .byte $42
    .addr textbuffer

    ; 184 lines graphics
    ; ANTIC mode e (160px 2bpp, 1 scan line per line), as two halves that
    ; each begin with an instruction carrying an explicit data address
    .byte $4e
    .addr framebuffer_top
    .repeat half_height - 1
        .byte $0e
    .endrepeat

    .byte $4e
    .addr framebuffer_bottom
    .repeat half_height - 1
        .byte $0e
    .endrepeat

    ; jump back to the start of the live display list
    .byte $41                       ; jump and blank
    .addr display_list
display_list_end:

display_list_len = display_list_end - display_list_start
|
|
|
|
; Iteration count -> 2-bit pixel color.
; Entry 0 is color 0; entries 1..255 cycle through 1, 2, 3.
color_map:
    .byte 0
    .repeat 85
        .byte 1, 2, 3
    .endrepeat
|
|
|
|
; Base color register values: the first is used as-is for the background,
; the other three are offset by update_palette before being written.
palette:
    .byte $00, $46, $78, $b4
|
|
.code

; Buffer of recent (zx, zy) pairs, used by mandelbrot to detect repeating
; orbits. Each entry is two 16-bit words; z_buffer_len entries in total.
z_buffer_len  = 16
z_buffer_mask = z_buffer_len - 1

z_buffer:
    ; the last N zx/zy values
    .repeat z_buffer_len
        .word 0, 0
    .endrepeat
|
|
|
|
.export start
|
|
|
|
; Progressive-fill masks, one per fill level. A pixel belongs to a level's
; grid when both its x and y coordinates have no bits in common with that
; level's mask, so each level halves the grid spacing (32, 16, 8, 4, 2, 1).
max_fill_level = 6

fill_masks:
    .repeat max_fill_level, level
        .byte %00011111 >> level
    .endrepeat
|
|
|
|
; Multi-byte little-endian addition: dest = arg1 + arg2, `bytes` bytes wide.
; Clobbers A and flags; the carry out of the top byte is left in C.
; Cost per the original author's count: 2 + 9 * bytes cycles.
.macro add bytes, dest, arg1, arg2
    clc                             ; 2 cyc - no carry into the low byte
    .repeat bytes, offset           ; 9 cyc per byte
        lda arg1 + offset
        adc arg2 + offset
        sta dest + offset
    .endrepeat
.endmacro

; 16-bit addition: dest = arg1 + arg2
.macro add16 dest, arg1, arg2
    add 2, dest, arg1, arg2
.endmacro
|
|
|
|
; 32-bit addition: dest = arg1 + arg2
; Clobbers A and flags (see add).
;
; Both source operands are now used. The previous body expanded to
; `add 4, dest, arg2, dest`, which ignored arg1 and computed dest + arg2;
; this form matches add16 and sub32.
.macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
.endmacro
|
|
|
|
; Fold the carry left by a preceding addition into the byte at dest:
; dest = dest + C. Clobbers A and flags.
.macro add_carry dest
    lda dest
    adc #0                          ; adds only the incoming carry
    sta dest
.endmacro
|
|
|
|
; Multi-byte little-endian subtraction: dest = arg1 - arg2, `bytes` bytes wide.
; Clobbers A and flags. Cost per the original author's count:
; 2 + 9 * bytes cycles.
.macro sub bytes, dest, arg1, arg2
    sec                             ; 2 cyc - no borrow into the low byte
    .repeat bytes, offset           ; 9 cyc per byte
        lda arg1 + offset
        sbc arg2 + offset
        sta dest + offset
    .endrepeat
.endmacro

; 16-bit subtraction: dest = arg1 - arg2
.macro sub16 dest, arg1, arg2
    sub 2, dest, arg1, arg2
.endmacro

; 32-bit subtraction: dest = arg1 - arg2
.macro sub32 dest, arg1, arg2
    sub 4, dest, arg1, arg2
.endmacro
|
|
|
|
; Shift a little-endian value of `bytes` bytes left by one bit, in place.
; A zero is shifted into bit 0; the bit shifted out of the top byte ends up
; in C. Clobbers flags only.
.macro shl bytes, arg
    asl arg                         ; low byte: shift in a zero
    .repeat bytes - 1, n
        rol arg + n + 1             ; higher bytes: shift in the carry
    .endrepeat
.endmacro

; 16-bit shift left by one
.macro shl16 arg
    shl 2, arg
.endmacro

; 24-bit shift left by one
.macro shl24 arg
    shl 3, arg
.endmacro

; 32-bit shift left by one
.macro shl32 arg
    shl 4, arg
.endmacro
|
|
|
|
; Copy `bytes` bytes from arg to dest, lowest address first.
; Clobbers A and flags. Cost per the original author's count:
; 6 * bytes cycles.
.macro copy bytes, dest, arg
    .repeat bytes, offset           ; 6 cyc per byte
        lda arg + offset            ; 3 cyc
        sta dest + offset           ; 3 cyc
    .endrepeat
.endmacro

; Copy a 16-bit value
.macro copy16 dest, arg
    copy 2, dest, arg
.endmacro

; Copy a 32-bit value
.macro copy32 dest, arg
    copy 4, dest, arg
.endmacro

; Copy a 6-byte floating-point value
.macro copyfloat dest, arg
    copy 6, dest, arg
.endmacro
|
|
|
|
; Two's-complement negate a little-endian value of `bytes` bytes in place,
; computed as 0 - arg. Clobbers A and flags.
; Cost per the original author's count: 2 + 8 * bytes cycles.
.macro neg bytes, arg
    sec                             ; 2 cyc - no borrow into the low byte
    .repeat bytes, offset           ; 8 cyc per byte
        lda #00                     ; 2 cyc
        sbc arg + offset            ; 3 cyc
        sta arg + offset            ; 3 cyc
    .endrepeat
.endmacro

; Negate a 16-bit value in place (18 cycles)
.macro neg16 arg
    neg 2, arg
.endmacro

; Negate a 32-bit value in place (34 cycles)
.macro neg32 arg
    neg 4, arg
.endmacro
|
|
|
|
; Shift the 32-bit value at arg left by `shift` bits, then round its upper
; 16 bits (arg + 2, arg + 3) according to the lower 16 bits.
; Clobbers A and flags.
.macro shift_round_16 arg, shift
    .repeat shift
        shl32 arg
    .endrepeat
    round16 arg
.endmacro
|
|
|
|
; 16x16-bit multiply with rescaling:
;   dest = upper 16 bits of ((arg1 * arg2) << shift), rounded
; The operands are staged in FR0 and FR1, the 32-bit product is produced in
; FR2 by imul16_func, and the rounded high word of FR2 is copied to dest.
; Clobbers FR0, FR1, FR2 and whatever imul16_func clobbers.
.macro imul16_round dest, arg1, arg2, shift
    copy16 FR0, arg1                ; 12 cyc
    copy16 FR1, arg2                ; 12 cyc
    jsr imul16_func                 ; ? cyc
    shift_round_16 FR2, shift
    copy16 dest, FR2 + 2            ; 12 cyc
.endmacro
|
|
|
|
; Unsigned 8x8 -> 16-bit multiply by table lookup.
; Adapted from https://everything2.com/title/Fast+6502+multiplication
;
;   dest, dest + 1 = arg1 * arg2
;
; Uses the identity a*x = (a+x)^2/2 - a^2/2 - x^2/2 with the imported
; tables mul_lobyte256 / mul_hibyte256 / mul_hibyte512 (presumably the
; halved squares of the index, with mul_hibyte512 covering sums that
; overflowed 8 bits), plus a +1 correction when both factors are odd.
;
; Clobbers A, X and flags.
; circa 92 cycles? this doesn't seem right
; 81-92 cycles
.macro imul8 dest, arg1, arg2
    .local sum_fits
    .local have_hi
    .local no_carry

    ; (a + x)^2/2
    lda arg1                        ; 3 cyc
    clc                             ; 2 cyc
    adc arg2                        ; 3 cyc
    tax                             ; 2 cyc
    bcc sum_fits                    ; 2 cyc
    lda mul_hibyte512,x             ; 4 cyc - sum was >= 256
    bcs have_hi                     ; 2 cyc - always taken, C stays set
sum_fits:
    lda mul_hibyte256,x             ; 4 cyc
    sec                             ; 2 cyc - both paths reach have_hi with C set
have_hi:
    sta dest + 1                    ; 3 cyc
    lda mul_lobyte256,x             ; 4 cyc

    ; - a^2/2
    ldx arg1                        ; 3 cyc
    sbc mul_lobyte256,x             ; 4 cyc
    sta dest                        ; 3 cyc
    lda dest + 1                    ; 3 cyc
    sbc mul_hibyte256,x             ; 4 cyc
    sta dest + 1                    ; 3 cyc

    ; + x & a & 1:
    ; (this is a kludge to correct a
    ; roundoff error that makes odd * odd too low)
    ldx arg2                        ; 3 cyc
    txa                             ; 2 cyc
    and arg1                        ; 3 cyc
    and #1                          ; 2 cyc

    clc                             ; 2 cyc
    adc dest                        ; 3 cyc
    bcc no_carry                    ; 2 cyc
    inc dest + 1                    ; 5 cyc

    ; - x^2/2 (X still holds arg2; A holds the corrected low byte)
no_carry:
    sec                             ; 2 cyc
    sbc mul_lobyte256,x             ; 4 cyc
    sta dest                        ; 3 cyc
    lda dest + 1                    ; 3 cyc
    sbc mul_hibyte256,x             ; 4 cyc
    sta dest + 1                    ; 3 cyc
.endmacro
|
|
|
|
; Lookup table: factor byte -> PORTB value for bank-switch.
; The top two bits of the index select one of four banks and are placed in
; bits 2-3 of the PORTB value; the remaining bits are the fixed $e1 pattern
; also used by the bank_switch macro.
;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
bank_switch_table:
    .repeat 256, hi
        .byte $e1 | ((hi >> 6) << 2)
    .endrepeat
|
|
|
|
; Select extended-memory bank `bank` (an assembly-time constant, 0..3) by
; writing the matching value to PORTB. Clobbers A and flags.
.macro bank_switch bank
    lda #($e1 | (bank << 2))
    sta PORTB
.endmacro
|
|
|
|
|
|
; Unsigned 8x8 -> 16-bit multiply using the big multiplication table held
; in four 16 KB banks of extended memory (built by imul8xe_init).
;
;   dest, dest + 1 = arg1 * arg2
;
; The table stores (arg1 with bit 0 cleared) * arg2; when arg1 is odd the
; missing arg2 is added afterwards.
;
; 58-77 cycles
; clobbers A, x, y, dest to dest + 3 (dest + 2 / dest + 3 hold the lookup
; pointer), and leaves the selected bank mapped in at EXTENDED_RAM
.macro imul8xe dest, arg1, arg2
    .local finished
    .local product
    .local entry

    product = dest
    entry   = dest + 2              ; scratch space assumed

    ; bottom 14 bits except the LSB are the per-bank table index
    ; add $4000 for the bank pointer
    lda arg1                        ; 3 cyc
    and #$fe                        ; 2 cyc
    sta entry                       ; 3 cyc
    lda arg2                        ; 3 cyc
    and #$3f                        ; 2 cyc
    clc                             ; 2 cyc
    adc #$40                        ; 2 cyc
    sta entry + 1                   ; 3 cyc

    ; top 2 bits are the table bank selector
    ldx arg2                        ; 3 cyc
    lda bank_switch_table,x         ; 4 cyc
    sta PORTB                       ; 4 cyc

    ; copy the entry into the product
    ldy #0                          ; 2 cyc
    lda (entry),y                   ; 5 cyc
    sta product                     ; 3 cyc
    iny                             ; 2 cyc
    lda (entry),y                   ; 5 cyc
    sta product + 1                 ; 3 cyc

    ; note: we are not restoring memory to save 6 cycles!
    ; this means those 16kb have to be switched back to base RAM
    ; if we need to use them anywhere else
    ;;; restore memory
    ;;lda #$81                      ; 2 cyc - disabled
    ;;sta PORTB                     ; 4 cyc - disabled

    ; check that 1 bit we skipped to fit into space
    lda arg1                        ; 3 cyc
    and #1                          ; 2 cyc
    beq finished                    ; 2 cyc

    ; add the second param one last time for the skipped bit
    clc                             ; 2 cyc
    lda arg2                        ; 3 cyc
    adc product                     ; 3 cyc
    sta product                     ; 3 cyc
    lda #0                          ; 2 cyc
    adc product + 1                 ; 3 cyc
    sta product + 1                 ; 3 cyc

finished:
.endmacro
|
|
|
|
; Detect bank-switched extended memory and, if present, switch the program
; over to the table-driven multiplier.
;
; Detection: write 0 to EXTENDED_RAM with bank 0 selected and 1 with bank 1
; selected, then read it back with bank 0 selected. Reading 0 means the two
; writes went to different memory, i.e. bank switching works.
;
; With bank switching: patches the start of imul16_func into a jump to
; imul16xe_func and fills all four banks with the multiplication table.
; Without: returns, leaving the probe value in the byte at EXTENDED_RAM.
;
; Clobbers A, X?, Y, flags, FR0, FR1, FR2, temp2 and PORTB
; (X is not touched by the code visible here).
.proc imul8xe_init
    mul_a = FR1
    mul_b = FR2

    ; probe for independent banks
    bank_switch 0
    lda #0
    sta EXTENDED_RAM
    bank_switch 1
    lda #1
    sta EXTENDED_RAM
    bank_switch 0
    lda EXTENDED_RAM
    beq build_table

    ; no bank switching available, we just overwrite the value in base ram
    rts

build_table:
    ; patch imul16_func into a forwarding thunk to imul16xe_func
    lda #$4c                        ; 'jmp' opcode
    sta imul16_func
    lda #<imul16xe_func
    sta imul16_func + 1
    lda #>imul16xe_func
    sta imul16_func + 2

    ; create the lookup table
    ; go through the input set, in four 16KB chunks; the factors start at
    ; zero and imul8xe_init_section carries them on from chunk to chunk
    lda #$00
    sta mul_a
    sta mul_b

    ; bank 0: $00 * $00 -> $3f * $ff
    ; bank 1: $40 * $00 -> $7f * $ff
    ; bank 2: $80 * $00 -> $bf * $ff
    ; bank 3: $c0 * $00 -> $ff * $ff
    .repeat 4, n
        bank_switch n
        jsr imul8xe_init_section
    .endrepeat

    rts
.endproc
|
|
|
|
; Initialize a 16 KB chunk of the multiplication table in the bank that is
; currently mapped at $4000-$7fff.
;
; Each 256-byte page is one row: 128 two-byte entries holding
; (even factor) * (row factor) for even factors 0, 2, ..., 254.
; The row factor increases by one per page, 64 pages per chunk.
;
; input:    FR1 = even factor (expected 0), FR2 = row factor for the first
;           page of this chunk
; output:   FR2 advanced by 64, ready for the next chunk; FR1 wrapped back
;           to its starting value
; clobbers: A, Y, flags, FR0 (running product), FR1, FR2, temp2 (pointer)
.proc imul8xe_init_section
    even_factor = FR1
    row_factor  = FR2
    product     = FR0
    entry       = temp2

    ; entry = $4000, start of the banked window
    lda #$00
    sta entry
    lda #$40
    sta entry + 1

    ldy #0

    ; one pass per page: $40 -> $7f
next_row:

    ; the row starts at 0 * row_factor
    lda #0
    sta product
    sta product + 1

    ; one pass per even factor: $00 -> $fe
next_entry:

    ; store the product, low byte first (Y goes 0 -> 1 -> 0)
    lda product
    sta (entry),y
    lda product + 1
    iny
    sta (entry),y
    dey

    ; product += 2 * row_factor, as two single additions
    .repeat 2
        clc
        lda row_factor
        adc product
        sta product
        lda #0
        adc product + 1
        sta product + 1
    .endrepeat

    ; step to the next even factor and the next two-byte slot;
    ; the row is finished when the pointer's low byte wraps to zero
    inc even_factor
    inc even_factor
    inc entry
    inc entry
    bne next_entry

    ; step to the next row and page; stop at the end of the window ($8000)
    inc row_factor
    inc entry + 1
    lda entry + 1
    cmp #$80
    bne next_row

    rts

.endproc
|
|
|
|
; Signed 16x16 -> 32-bit multiply built from four unsigned 8x8 multiplies
; (table-of-squares version).
;
; input:    FR0 = first factor (i16), FR1 = second factor (i16)
; output:   FR2 = 32-bit product
; clobbers: A, X, flags, temp2
;
; Note: imul8xe_init may overwrite the first three bytes of this routine
; with a jump to imul16xe_func.
.proc imul16_func
    lhs     = FR0                   ; 16-bit arg
    rhs     = FR1                   ; 16-bit arg
    product = FR2                   ; 32-bit result
    partial = temp2

    ; h1l1 * h2l2
    ; (h1*256 + l1) * (h2*256 + l2)
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2

    ; l1 * l2 -> bytes 0-1, upper word cleared
    imul8 product, lhs, rhs
    lda #0
    sta product + 2
    sta product + 3

    ; h1 * l2 -> added at byte 1
    imul8 partial, lhs + 1, rhs
    add16 product + 1, product + 1, partial
    add_carry product + 3

    ; l1 * h2 -> added at byte 1
    imul8 partial, lhs, rhs + 1
    add16 product + 1, product + 1, partial
    add_carry product + 3

    ; h1 * h2 -> added at byte 2
    imul8 partial, lhs + 1, rhs + 1
    add16 product + 2, product + 2, partial

    ; In case of negative inputs, adjust high word
    ; https://stackoverflow.com/a/28827013
    lda lhs + 1
    bpl lhs_nonneg
    sub16 product + 2, product + 2, rhs
lhs_nonneg:
    lda rhs + 1
    bpl rhs_nonneg
    sub16 product + 2, product + 2, lhs
rhs_nonneg:

    rts                             ; 6 cyc
.endproc
|
|
|
|
; Signed 16x16 -> 32-bit multiply built from four unsigned 8x8 multiplies
; (extended-memory table version; same contract as imul16_func).
;
; input:    FR0 = first factor (i16), FR1 = second factor (i16)
; output:   FR2 = 32-bit product
; clobbers: A, X, Y, flags, PORTB, temp2 and the two bytes after it
;           (imul8xe keeps its lookup pointer at dest + 2, which for temp2
;           is the location of pixel_ptr)
.proc imul16xe_func
    lhs     = FR0                   ; 16-bit arg
    rhs     = FR1                   ; 16-bit arg
    product = FR2                   ; 32-bit result
    partial = temp2

    ; h1l1 * h2l2
    ; (h1*256 + l1) * (h2*256 + l2)
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2

    ; l1 * l2 -> bytes 0-1; bytes 2-3 were lookup scratch, so clear them after
    imul8xe product, lhs, rhs
    lda #0
    sta product + 2
    sta product + 3

    ; h1 * l2 -> added at byte 1
    imul8xe partial, lhs + 1, rhs
    add16 product + 1, product + 1, partial
    add_carry product + 3

    ; l1 * h2 -> added at byte 1
    imul8xe partial, lhs, rhs + 1
    add16 product + 1, product + 1, partial
    add_carry product + 3

    ; h1 * h2 -> added at byte 2
    imul8xe partial, lhs + 1, rhs + 1
    add16 product + 2, product + 2, partial

    ; In case of negative inputs, adjust high word
    ; https://stackoverflow.com/a/28827013
    lda lhs + 1
    bpl lhs_nonneg
    sub16 product + 2, product + 2, rhs
lhs_nonneg:
    lda rhs + 1
    bpl rhs_nonneg
    sub16 product + 2, product + 2, lhs
rhs_nonneg:

    rts                             ; 6 cyc
.endproc
|
|
|
|
; Round top 16 bits of 32-bit fixed-point number in-place.
;
; arg is a little-endian 32-bit value; its high word (arg + 2, arg + 3) is
; adjusted according to its low word (arg, arg + 1):
;
;   low word > $8000: round up
;            = $8000: round up if positive
;                     round down if negative
;            < $8000: round down
;
; "Round up" increments the high word, "round down" leaves it unchanged.
; Clobbers A and flags.
;
; Fix: when the low word's high byte was exactly $80 and its low byte was
; $80..$ff, the old code tested the low byte with bpl/bmi and rounded
; down, although such a low word is above $8000. Any non-zero low byte now
; rounds up. The first comparison uses the carry (unsigned) instead of the
; sign flag; for a compare against $80 the outcome is the same.
.macro round16 arg
    .local round_up
    .local done

    lda arg + 1
    cmp #$80
    bcc done                        ; low word < $8000: round down
    bne round_up                    ; low word >= $8100: round up

    ; high byte of the low word is exactly $80
    lda arg
    bne round_up                    ; $8001..$80ff: above half, round up

    ; exact tie: round up only for a non-negative value
    lda arg + 3
    bmi done

round_up:                           ; 5-10 cyc
    inc arg + 2                     ; 5 cyc
    bne done                        ; 2 cyc
    inc arg + 3                     ; 5 cyc

done:

.endmacro
|
|
|
|
; Iterate z = z^2 + c for one point.
;
; input:
;   cx: position scaled to 4.12 fixed point - -8..+7.9
;   cy: position scaled to 4.12
;   z_buffer_active: non-zero enables the repeated-orbit check
;
; output:
;   iter: iteration count at escape or 0 (0 = ran out of iterations, or a
;         repeated z value was found)
;   z_buffer_active: 1 if iter ended up 0, else 0 (so the orbit check is
;         enabled for the next point only after a point that did not escape)
;
; clobbers: A, X, Y, flags, zx..dist, z_buffer, z_buffer_start/end and
;           everything imul16_round clobbers
;
; Fix: the "advance the start roller" test compared the unsigned iteration
; count with cmp/bmi (a signed test), so the roller stopped advancing once
; iter reached 192. It now uses bcc. z_advance uses bcc as well; its index
; never exceeds z_buffer_len * 4 in this routine, so that is behaviour-
; neutral.
.proc mandelbrot

    ; --- helper macros (they emit code only where invoked) ---------------

    ; Leave the loop through exit_path unless the high byte of the 4.12
    ; value at arg shows it inside (-max, +max). max is in whole units.
    .macro quick_exit arg, max
        .local positive
        .local negative
        .local nope_out
        .local first_equal
        .local all_done

        ; check sign bit
        lda arg + 1
        bmi negative

    positive:
        cmp #((max) << 4)
        bmi all_done                ; 'less than'
        jmp exit_path

    negative:
        cmp #(256 - ((max) << 4))
        beq first_equal             ; 'equal' on first byte
        bpl all_done                ; 'greater than'

    nope_out:
        jmp exit_path

    first_equal:
        lda arg
        beq nope_out                ; 2nd byte 0 shows it's really 'equal'

    all_done:
    .endmacro

    ; Compare the buffer byte at index X with arg; advance X, and count a
    ; match in Y.
    .macro z_compare arg
        .local compare_no_match
        lda z_buffer,x
        inx
        cmp arg
        bne compare_no_match
        iny
    compare_no_match:
    .endmacro

    ; Wrap the buffer index in X back to 0 once it reaches the buffer size.
    .macro z_advance
        .local skip_reset_x
        cpx #(z_buffer_len * 4)
        bcc skip_reset_x            ; unsigned: still inside the buffer
        ldx #0
    skip_reset_x:
    .endmacro

    ; Store arg into the buffer at index X and advance X.
    .macro z_store arg
        lda arg
        sta z_buffer,x
        inx
    .endmacro

    ; --- initial state -----------------------------------------------------

    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
    ; (one loop clears every byte from zx through iter)
    lda #00
    ldx #(iter - zx + 1)
initloop:
    sta zx - 1,x
    dex
    bne initloop
    sta z_buffer_start
    sta z_buffer_end

loop:
    ; iter++ & max-iters break (iter wrapping to 0 means "never escaped")
    inc iter
    bne keep_going
    jmp exit_path
keep_going:

    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2 - zy_2 + cx
    sub16 zx, zx_2, zy_2
    add16 zx, zx, cx
    quick_exit zx, 2

    ; zy = zx_zy + zx_zy + cy
    add16 zy, zx_zy, zx_zy
    add16 zy, zy, cy
    quick_exit zy, 2

    ; zx_2 = zx * zx
    imul16_round zx_2, zx, zx, 4

    ; zy_2 = zy * zy
    imul16_round zy_2, zy, zy, 4

    ; zx_zy = zx * zy
    imul16_round zx_zy, zx, zy, 4

    ; dist = zx_2 + zy_2
    add16 dist, zx_2, zy_2
    quick_exit dist, 4

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
    lda z_buffer_active
    beq skip_z_buffer

    ldx z_buffer_start
    cpx z_buffer_end
    beq z_nothing_to_read

z_buffer_loop:
    ; Compare the previously stored z values; Y counts matching bytes
    ldy #0
    z_compare zx
    z_compare zx + 1
    z_compare zy
    z_compare zy + 1

    cpy #4
    bne z_no_matches
    jmp z_exit                      ; same z seen before: the orbit repeats

z_no_matches:
    z_advance

    cpx z_buffer_end
    bne z_buffer_loop

z_nothing_to_read:

    ; Store and expand (X equals z_buffer_end here)
    z_store zx
    z_store zx + 1
    z_store zy
    z_store zy + 1
    z_advance
    stx z_buffer_end

    ; Increment the start roller if necessary (limit size)
    lda iter
    cmp #(z_buffer_len * 4)
    bcc skip_inc_start              ; unsigned: iter below the threshold
    lda z_buffer_start
    clc
    adc #4
    tax
    z_advance
    stx z_buffer_start
skip_inc_start:

skip_z_buffer:

    jmp loop

z_exit:
    ; a repeating orbit never escapes: report it like max iterations
    lda #0
    sta iter

exit_path:
    ; z_buffer_active = (iter == 0) ? 1 : 0
    ldx #0
    lda iter
    bne next
    inx
next:
    stx z_buffer_active
    rts

.endproc
|
|
|
|
; Shift the 16-bit value at dest left by (8 - zoom) bits, e.g.
; cx = (sx << (8 - zoom)). Counts X up from zoom until it equals 8.
; clobbers X, flags
.macro scale_zoom dest
    .local again
    .local finished

    ldx zoom
again:
    cpx #8
    beq finished
    shl16 dest
    inx
    jmp again
finished:
.endmacro
|
|
|
|
; Convert a screen coordinate into a 4.12 fixed-point coordinate:
;   dest = (src << (8 - zoom)) * aspect
; where aspect is the address of a 4.12 factor:
;   cy = cy * (3 / 4)
;   cx = cx * (5 / 4)
; The zoom level is always read from the zoom variable by scale_zoom.
; clobbers A, X, flags, etc (everything imul16_round clobbers)
.macro zoom_factor dest, src, zoom, aspect
    copy16 dest, src
    scale_zoom dest
    imul16_round dest, dest, aspect, 4
.endmacro
|
|
|
|
; Plot one pixel.
;
; input:    sx, sy = signed screen coords (0,0 is the centre of the bitmap)
;           iter   = iteration count, translated to a color via color_map
; clobbers: A, X, Y, flags, temp, pixel_ptr, pixel_color, pixel_mask,
;           pixel_shift, pixel_offset
.proc pset
    ; iter -> color, in the lowest pixel slot of the byte;
    ; the mask keeps every bit except that slot
    ldx iter
    lda color_map,x
    sta pixel_color
    lda #(255 - 3)
    sta pixel_mask

    ; pick the base address for row 0 according to the sign of sy
    lda sy
    bpl lower_half

    ; sy < 0: base is half_height rows into the top framebuffer, and the
    ; (negative) row offset added below moves back up from there
    lda #<(framebuffer_top + stride * half_height)
    sta pixel_ptr
    lda #>(framebuffer_top + stride * half_height)
    sta pixel_ptr + 1
    jmp add_row_offset

lower_half:
    ; sy >= 0: rows count down from the start of the bottom framebuffer
    lda #<framebuffer_bottom
    sta pixel_ptr
    lda #>framebuffer_bottom
    sta pixel_ptr + 1

add_row_offset:
    ; pixel_ptr += sy * stride
    ; sy * 40
    ; = sy * 8 + sy * 32
    ; = (sy << 3) + (sy << 5)
    copy16 temp, sy
    .repeat 3
        shl16 temp
    .endrepeat
    add16 pixel_ptr, pixel_ptr, temp
    .repeat 2
        shl16 temp
    .endrepeat
    add16 pixel_ptr, pixel_ptr, temp

    ; pixel_ptr now points to the start of the line, which is 40 bytes.
    ; Get the byte and bit offsets from the 0-based column
    lda sx
    clc
    adc #half_width
    sta temp

    ; pixel_shift = column & 3
    ; then, once for each pixel slot to the right of the target (3 - shift):
    ;   pixel_color <<= 2 (shifting in zeros)
    ;   pixel_mask  <<= 2 (shifting in ones)
    and #3
    sta pixel_shift
    lda #3
    sec
    sbc pixel_shift
    tax                             ; sets Z for the first loop test
shift_loop:
    beq shift_done                  ; tests the flags left by tax / dex
    asl pixel_color
    asl pixel_color
    sec
    rol pixel_mask
    sec
    rol pixel_mask
    dex
    jmp shift_loop
shift_done:

    ; pixel_offset = column >> 2
    lda temp
    lsr a
    lsr a
    sta pixel_offset
    tay

    ; read, mask, or, write
    lda (pixel_ptr),y
    and pixel_mask
    ora pixel_color
    sta (pixel_ptr),y

    rts
.endproc
|
|
|
|
; Draw `len` characters from the string whose address is stored in the
; zero-page pointer `strptr` into the text row, starting at column `col`.
; Each character is translated through char_map.
;
; clobbers A, X, Y
;
; Fix: bit 7 of every character is cleared before the char_map lookup.
; FASC marks the last character of its output by setting the high bit, and
; char_map only has 128 entries, so such a character used to index past
; the end of the table. Characters after the marked one are still whatever
; the buffer happens to contain.
.macro draw_text_indirect col, len, strptr
    .local loop
    .local done
    ldx #0
loop:
    cpx #len
    beq done
    txa
    tay
    lda (strptr),y
    and #$7f                        ; drop the end-of-string marker bit
    tay
    lda char_map,y
    sta textbuffer + col,x
    inx
    jmp loop
done:
.endmacro
|
|
|
|
; Draw `len` characters of the string at address `cstr` into the text row,
; starting at column `col`. Each character is translated through char_map.
; clobbers A, X, Y
.macro draw_text col, len, cstr
    .local next_char
    .local finished
    ldx #0
next_char:
    cpx #len
    beq finished
    ldy cstr,x                      ; source character ...
    lda char_map,y                  ; ... to font entry
    sta textbuffer + col,x
    inx
    jmp next_char
finished:
.endmacro
|
|
|
|
; Vertical-blank handler (installed by start through SETVBV).
; Advances the palette animation, counts the frame for the speed
; statistics, and exits through XITVBV.
.proc vblank_handler
    inc palette_offset
    inc count_frames
    jsr update_palette
    jmp XITVBV
.endproc
|
|
|
|
; Write the color shadow registers from the palette table.
;
; COLOR4 receives palette[0] unchanged. COLOR0..COLOR2 receive
; palette[1..3] plus the high nibble of palette_offset, so those three
; colors step once every 16 increments of palette_offset.
;
; clobbers: A, flags
;
; Fix: the routine had no rts and fell through the empty update_speed
; procedure into keycheck. It is called from vblank_handler, so every frame
; the interrupt could consume a pending keypress and modify temp, zoom, ox
; and oy behind the main loop's back.
.proc update_palette
    lda palette
    sta COLOR4

    clc
    lda palette_offset
    and #$f0
    adc palette + 1
    sta COLOR0

    clc
    lda palette_offset
    and #$f0
    adc palette + 2
    sta COLOR1

    clc
    lda palette_offset
    and #$f0
    adc palette + 3
    sta COLOR2

    rts
.endproc
|
|
|
|
; Placeholder for the speed read-out update. Nothing is implemented here;
; the steps below are the original author's plan (the start procedure
; currently does this work inline).
;
; Fix: the empty body had no rts, so control reaching this procedure ran
; straight on into keycheck. It now returns immediately.
.proc update_speed
    ; convert frames (u16) to fp
    ; add to frames_total
    ; convert pixels (u16) to fp
    ; add to pixels_total
    ; (frames_total * 16.66666667) / pixels_total
    ; convert to ATASCII
    ; draw text
    rts
.endproc
|
|
|
|
; Poll the keyboard and apply zoom / pan commands.
;
; returns 255 in A if state change or 0 if no change (Z flag set to match,
; so callers can branch with beq / bne right after the jsr)
; clobbers all (A, X, Y, flags, temp)
.proc keycheck

    ; check keyboard buffer: $ff means no key is waiting
    lda CH
    cmp #$ff
    beq no_change

    ; mark the key as consumed
    ldx #$ff
    stx CH

    ; Y = key code, A = current zoom level for the zoom handlers
    tay

    lda zoom
    cpy #KEY_PLUS
    beq zoom_in
    cpy #KEY_MINUS
    beq zoom_out

    ; pan step: temp = $0010 << (8 - zoom)
    lda #$10
    sta temp
    lda #$00
    sta temp + 1
    scale_zoom temp

    cpy #KEY_UP
    beq pan_up
    cpy #KEY_DOWN
    beq pan_down
    cpy #KEY_LEFT
    beq pan_left
    cpy #KEY_RIGHT
    beq pan_right

    ; any other key falls through: nothing changed
no_change:
    lda #0
    rts

zoom_in:
    ; zoom is capped at 8
    cmp #8
    bpl no_change
    inc zoom
    jmp changed

zoom_out:
    ; zoom never drops below 0
    cmp #1
    bmi no_change
    dec zoom
    jmp changed

pan_up:
    sub16 oy, oy, temp
    jmp changed

pan_down:
    add16 oy, oy, temp
    jmp changed

pan_left:
    sub16 ox, ox, temp
    jmp changed

pan_right:
    add16 ox, ox, temp

changed:
    lda #255
    rts

.endproc
|
|
|
|
; Zero the range from framebuffer_top up to (not including) display_list,
; one 256-byte page at a time. This covers both bitmap halves and the text
; row between them.
; clobbers: A, Y, flags, temp
.proc clear_screen
    ; temp = address of the first page
    lda #<framebuffer_top
    sta temp
    lda #>framebuffer_top
    sta temp + 1

next_page:
    lda #0
    ldy #0
fill_page:
    sta (temp),y
    iny
    bne fill_page                   ; Y wrapped: the page is done

    ; move on until the display list's page is reached
    inc temp + 1
    lda temp + 1
    cmp #>display_list
    bne next_page

    rts
.endproc
|
|
|
|
; Draw the fixed parts of the status bar: the program name at the left
; edge of the text row and the "running" marker at the right edge.
; clobbers: A, X, Y, flags
.proc status_bar
    draw_text 0, str_self_len, str_self
    draw_text 40 - str_run_len, str_run_len, str_run

    rts
.endproc
|
|
|
|
; Program entry point: set up the multiplier, display and vblank handler,
; then render the view progressively, restarting whenever keycheck reports
; a change. Never returns.
;
; Fixes:
; - The status update test used cmp/bmi, a signed test, against width
;   (160). That fired when count_pixels was 32..159 instead of >= 160, so
;   the costly floating-point speed update ran every 32 pixels. The frame
;   test had the same fault. Both now use the carry (unsigned) as the
;   comments describe.
; - z_buffer_active is now cleared during setup; mandelbrot reads it before
;   it has ever been written.
.proc start

    jsr imul8xe_init

    ; ox = 0; oy = 0; zoom = 0
    ; count_frames = 0; count_pixels = 0
    ; z_buffer_active = 0
    lda #0
    sta ox
    sta ox + 1
    sta oy
    sta oy + 1
    sta count_frames
    sta count_pixels
    sta z_buffer_active

    ; total_ms = 0.0; total_pixels = 0.0
    ldx #total_ms
    jsr ZF1
    ldx #total_pixels
    jsr ZF1

    ; zoom = 2x
    lda #1
    sta zoom

    ; Disable display DMA
    lda #0
    sta DMACTL

    jsr clear_screen

    ; Copy the display list into properly aligned memory
    ; Can't cross 1024-byte boundaries :D
    ldx #0
copy_byte_loop:
    lda display_list_start,x
    sta display_list,x
    inx
    cpx #display_list_len
    bne copy_byte_loop

    ; Set up the display list
    lda #.lobyte(display_list)
    sta DLISTL                      ; actual register
    sta SDLSTL                      ; shadow register the OS will copy in
    lda #.hibyte(display_list)
    sta DLISTH                      ; actual register
    sta SDLSTH                      ; shadow register the OS will copy in

    ; Re-enable display DMA
    lda #$22
    sta DMACTL

    ; Initialize the palette
    lda #0
    sta palette_offset
    jsr update_palette

    ; install the vblank handler
    lda #7                          ; deferred
    ldx #.hibyte(vblank_handler)
    ldy #.lobyte(vblank_handler)
    jsr SETVBV

main_loop:
    jsr clear_screen
    jsr status_bar

    lda #0
    sta fill_level

fill_loop:

    ; sy = -92 .. 91
    lda #(256-half_height)
    sta sy
    lda #(256-1)
    sta sy + 1

loop_sy:
    ; sx = -80 .. 79
    lda #(256-half_width)
    sta sx
    lda #(256-1)
    sta sx + 1

loop_sx:
    ; check the fill mask: skip pixels already drawn by a coarser level,
    ; and pixels that are not on the current level's grid
    ldy #0

loop_skip_level:
    cpy fill_level
    beq current_level

    ; on the grid of earlier level Y (x and y both clear the mask)?
    ; then that level already drew this pixel
    lda fill_masks,y
    and sx
    bne not_skipped_mask1

    lda fill_masks,y
    and sy
    beq skipped_mask

not_skipped_mask1:
    iny
    jmp loop_skip_level

current_level:
    ; draw only when both coordinates are on the current level's grid
    lda fill_masks,y
    and sx
    bne skipped_mask

    lda fill_masks,y
    and sy
    beq not_skipped_mask

skipped_mask:
    jmp skipped

not_skipped_mask:

    ; run the fractal!
    zoom_factor cx, sx, zoom, aspect_x
    add16 cx, cx, ox
    zoom_factor cy, sy, zoom, aspect_y
    add16 cy, cy, oy
    jsr mandelbrot
    jsr pset

    jsr keycheck
    beq no_key
    ; @fixme clear the pixel stats
    jmp main_loop

no_key:
    ; check if we should update the counters
    ;
    ; count_pixels >= width? update!
    inc count_pixels
    lda count_pixels
    cmp #width
    bcs update_status               ; unsigned >=

    ; count_frames >= 120? update!
    lda count_frames
    cmp #120                        ; >= 2 seconds
    bcc skipped                     ; unsigned <

update_status:
    ; FR0 = (float)count_pixels & clear count_pixels
    lda count_pixels
    sta FR0
    lda #0
    sta FR0 + 1
    sta count_pixels
    jsr IFP

    ; FR1 = total_pixels
    ldx #.lobyte(total_pixels)
    ldy #.hibyte(total_pixels)
    jsr FLD1R

    ; FR0 += FR1
    jsr FADD

    ; total_pixels = FR0
    ldx #.lobyte(total_pixels)
    ldy #.hibyte(total_pixels)
    jsr FST0R

    ; FR0 = (float)count_frames & clear count_frames
    ; warning: this should really disable interrupts @TODO
    lda count_frames
    sta FR0
    lda #0
    sta FR0 + 1
    sta count_frames
    jsr IFP

    ; FR0 *= ms_per_frame
    ldx #.lobyte(ms_per_frame)
    ldy #.hibyte(ms_per_frame)
    jsr FLD1R
    jsr FMUL

    ; FR0 += total_ms
    ldx #total_ms
    ldy #0
    jsr FLD1R
    jsr FADD

    ; total_ms = FR0
    ldx #total_ms
    ldy #0
    jsr FST0R

    ; FR0 /= total_pixels
    ldx #total_pixels
    ldy #0
    jsr FLD1R
    jsr FDIV

    ; convert to ASCII in INBUFF
    jsr FASC

    ; print the first 6 digits
    draw_text_indirect speed_start, speed_precision, INBUFF
    draw_text speed_start + speed_precision, str_speed_len, str_speed

skipped:

    ; sx++ (16-bit); the row ends when the low byte reaches half_width
    clc
    lda sx
    adc #1
    sta sx
    lda sx + 1
    adc #0
    sta sx + 1

    lda sx
    cmp #half_width
    beq loop_sx_done
    jmp loop_sx

loop_sx_done:

    ; sy++ (16-bit); the pass ends when the low byte reaches half_height
    clc
    lda sy
    adc #1
    sta sy
    lda sy + 1
    adc #0
    sta sy + 1

    lda sy
    cmp #half_height
    beq loop_sy_done
    jmp loop_sy

loop_sy_done:

fill_loop_done:
    ; next, finer fill level, until all of them have run
    inc fill_level
    lda fill_level
    cmp #max_fill_level
    beq loop
    jmp fill_loop

loop:
    ; finished: show the done marker and wait for a key that changes the view
    draw_text 40 - str_done_len, str_done_len, str_done
    jsr keycheck
    beq loop
    jmp main_loop

.endproc
|