mandel-6502/mandel.s
2024-12-31 09:53:22 -08:00

1738 lines
32 KiB
ArmAsm

; Our zero-page vars
; Multi-byte values are stored low byte first: the add/sub/copy macros
; below walk them from byte 0 upwards, propagating carry.
ox = $80 ; fixed8.24: center point x
oy = $84 ; fixed8.24: center point y
cx = $88 ; fixed8.24: c_x
cy = $8c ; fixed8.24: c_y
zx = $90 ; fixed8.24: z_x
zy = $94 ; fixed8.24: z_y
zx_2 = $98 ; fixed8.24: z_x^2
zy_2 = $9c ; fixed8.24: z_y^2
zx_zy = $a0 ; fixed8.24: z_x * z_y
dist = $a4 ; fixed8.24: z_x^2 + z_y^2
sx = $a8 ; i16: screen pixel x
sy = $aa ; i16: screen pixel y
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not (set at mandelbrot exit)
z_buffer_start = $ad ; u8: index into z_buffer
z_buffer_end = $ae ; u8: index into z_buffer
iter = $af ; u8: iteration count
ptr = $b0 ; u16: table pointer for the XE multiply (low byte must stay 0)
pixel_ptr = $b2 ; u16: framebuffer line address computed by pset
zoom = $b4 ; u8: zoom shift level
fill_level = $b5 ; u8: current progressive-fill pass (index into fill_masks)
pixel_color = $b6 ; u8
pixel_mask = $b7 ; u8
pixel_shift = $b8 ; u8
pixel_offset = $b9 ; u8
palette_offset = $ba ; u8: index into palette_start
chroma_offset = $bb ; u8: index into palette_chroma
palette_ticks = $bc ; u8: frames since last palette step
chroma_ticks = $bd ; u8: frames since last chroma step
count_frames = $be ; u8: incremented by vblank_handler
count_pixels = $bf ; u8
total_pixels = $c0 ; float48
total_ms = $c6 ; float48
temp = $cc ; u16
temp2 = $ce ; u16
; NOTE: the two *_delay symbols are frame counts used as immediates
; (cmp #...) in vblank_handler. They are NOT zero-page addresses.
palette_delay = 23
chroma_delay = 137
; FP registers in zero page
FR0 = $d4 ; float48
FRE = $da
FR1 = $e0 ; float48
FR2 = $e6 ; float48
CIX = $f2 ; u8 - index into INBUFF
INBUFF = $f3 ; u16 - pointer to ascii
FLPTR = $fc ; u16 - pointer to user buffer float48
; The following three live outside page zero
CH1 = $02f2 ; previous character read from keyboard
CH = $02fc ; current character read from keyboard
LBUFF = $0580 ; result buffer for FASC routine
; FP ROM routine vectors
; NOTE: this file also reuses FR0/FR1/FR2 as plain integer scratch for
; the multiply routines (see imul16_impl / sqr16_impl).
FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
FADD = $DA66 ; ADDITION (FR0 += FR1)
FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1)
FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1)
FDIV = $DB28 ; DIVISION (FR0 /= FR1)
ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
FMOVE = $DDB6 ; MOVE FR0 TO FR1
; High data
; Layout used by the display list: top half of the picture at
; framebuffer_top, one text line at textbuffer, bottom half at
; framebuffer_bottom, and the display list itself at display_list.
framebuffer_top = $a000
textbuffer = $af00
framebuffer_bottom = $b000
display_list = $bf00
framebuffer_end = $c000
height = 184
half_height = height >> 1
width = 160
half_width = width >> 1
stride = width >> 2 ; bytes per scan line (4 pixels per byte)
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
PORTB = $D301 ; memory & bank-switch for XL/XE
DMACTL = $D400
DLISTL = $D402
DLISTH = $D403
WSYNC = $D40A
; OS shadow registers
SDLSTL = $230
SDLSTH = $231
; interrupt stuff
SYSVBV = $E45F
XITVBV = $E462
SETVBV = $E45C
COLOR0 = $2C4
COLOR1 = $2C5
COLOR2 = $2C6
COLOR3 = $2C7
COLOR4 = $2C8
; Keycodes!
; (values compared against CH in keycheck; note KEY_4..KEY_0 are written
; in decimal while the others are hex)
KEY_PLUS = $06
KEY_MINUS = $0e
KEY_UP = $8e
KEY_DOWN = $8f
KEY_LEFT = $86
KEY_RIGHT = $87
KEY_1 = $1f
KEY_2 = $1e
KEY_3 = $1a
KEY_4 = 24
KEY_5 = 29
KEY_6 = 27
KEY_7 = 51
KEY_8 = 53
KEY_9 = 48
KEY_0 = 50
; 6-byte floating point value: one exponent byte followed by five
; mantissa bytes (see ms_per_frame for a literal in this layout).
.struct float48
exponent .byte
mantissa .byte 5
.endstruct
.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512
.import sqr_lobyte
.import sqr_hibyte
.data
; Status-bar strings. Each has a matching *_end label so the length can
; be computed at assembly time; they are drawn through char_map.
strings:
str_self:
.byte "MANDEL-6502"
str_self_end:
str_speed:
.byte " ms/px"
str_speed_end:
str_run:
.byte " RUN"
str_run_end:
str_done:
.byte "DONE"
str_done_end:
str_self_len = str_self_end - str_self
str_speed_len = str_speed_end - str_speed
str_run_len = str_run_end - str_run
str_done_len = str_done_end - str_done
; Layout of the speed readout on the 40-column text line:
; <speed_precision digits><str_speed>, placed just left of the
; RUN/DONE field at the right edge.
speed_precision = 6
speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
; NOTE(review): speed_len is not referenced anywhere in the visible source.
speed_len = 14 + str_speed_len
char_map:
; Map ATASCII string values to framebuffer font entries
; Sighhhhh
; 128 entries total:
;   $00-$1f -> $40-$5f
;   $20-$5f -> $00-$3f
;   $60-$7f -> $60-$7f (unchanged)
.repeat 32, i
.byte i + 64
.endrepeat
.repeat 64, i
.byte i
.endrepeat
.repeat 32, i
.byte 96 + i
.endrepeat
; Hex digit characters, indexed by nibble value.
hex_chars:
.byte "0123456789abcdef"
aspect:
; aspect ratio!
; pixels at 320w are 5:6 (narrow)
; pixels at 160w are 5:3 (wide)
;
; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
;
; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
; horizontal range -80 .. 79.9 is -3.125 .. 3.124
;
; 184h is the equiv of 220.8h at square pixels
; 320 / 220.8 = 1.45 display aspect ratio
;
; Both factors are fixed4.12 (1.0 = 1 << 12), matching the 'aspect'
; argument of zoom_factor: n << (12 - 2) is n / 4.
aspect_x: ; fixed4.12 5/4
.word 5 << (12 - 2)
aspect_y: ; fixed4.12 3/4
.word 3 << (12 - 2)
ms_per_frame: ; float48 16.66666667
.byte 64 ; exponent/sign
.byte $16 ; BCD digits
.byte $66
.byte $66
.byte $66
.byte $67
; Display list template. start copies display_list_len bytes from here to
; display_list; the final jump instruction points at that copy, not at
; this template.
display_list_start:
; 24 lines overscan
.repeat 3
.byte $70 ; 8 blank lines
.endrep
; 8 scan lines, 1 row of 40-column text
.byte $42
.addr textbuffer
; 184 lines graphics
; ANTIC mode e (160px 2bpp, 1 scan line per line)
; Split in two halves, each starting with a load-address instruction
; ($4e + address) followed by half_height - 1 plain mode-e lines.
.byte $4e
.addr framebuffer_top
.repeat half_height - 1
.byte $0e
.endrep
.byte $4e
.addr framebuffer_bottom
.repeat half_height - 1
.byte $0e
.endrep
.byte $41 ; jump and blank
.addr display_list
display_list_end:
; Must stay below 256: the copy loop in start indexes with an 8-bit X.
display_list_len = display_list_end - display_list_start
; iter -> 2-bit pixel value, 256 entries: 0 for iter 0, then 1,2,3
; repeating for iter 1..255. Indexed by pset.
color_map:
.byte 0
.repeat 85
.byte 1
.byte 2
.byte 3
.endrepeat
; Luminance values. update_palette reads three consecutive entries
; starting at palette_offset (0 .. palette_entries - 1), so the first two
; values are repeated after the end to avoid a wrap check.
palette_start:
.byte $0e
.byte $08
.byte $04
palette_repeat:
.byte $0e
.byte $08
palette_entries = 3
; Chroma (high nibble) values $10 .. $f0, followed by two repeated
; entries in the same wrap-avoidance style.
palette_chroma:
.repeat 15, i
.byte (i + 1) << 4
.endrepeat
.repeat 2, i
.byte (i + 1) << 4
.endrepeat
palette_chroma_entries = 15
.code
; Size of the cycle-detection buffer in entries, with the author's
; measured speed for each candidate size.
;z_buffer_len = 16 ; 10.863 ms/px
;z_buffer_len = 12 ; 10.619 ms/px
z_buffer_len = 8 ; 10.612 ms/px
;z_buffer_len = 4 ; 12.395 ms/px
; NOTE(review): z_buffer_mask is not referenced in the visible source.
z_buffer_mask = z_buffer_len - 1
z_buffer:
; the last N zx/zy values
; (4 bytes per entry: the top 16 bits of zx, then the top 16 bits of zy)
.repeat z_buffer_len
.word 0
.word 0
.endrepeat
.export start
; Progressive fill: pass n draws pixels whose sx and sy both have the
; fill_masks[n] bits clear and that no earlier pass already drew.
;max_fill_level = 6
max_fill_level = 3
fill_masks:
; .byte %00011111
; .byte %00001111
; .byte %00000111
.byte %00000011
.byte %00000001
.byte %00000000
; Preset viewports, indexed by load_viewport (keys 1-4 in keycheck).
; One zoom byte plus 32-bit fixed8.24 center coordinates per entry.
viewport_zoom:
.byte 1
.byte 6
.byte 8
.byte 6
viewport_ox:
.dword $00000000
.dword $ff110000
.dword $ff110000
.dword $fe400000
viewport_oy:
.dword $00000000
.dword $ffb60000
.dword $ffbe0000
.dword $00000000
; dest = arg1 + arg2 over 'bytes' bytes, low byte first.
; dest may alias either argument (each byte is read before it is written).
; clobbers A; carry out of the top byte is left in C.
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
clc ; 2 cyc
.repeat bytes, byte ; 9 * byte cycles
lda arg1 + byte
adc arg2 + byte
sta dest + byte
.endrepeat
.endmacro
; 16-bit dest = arg1 + arg2
; 20 cycles
.macro add16 dest, arg1, arg2
add 2, dest, arg1, arg2
.endmacro
; 32-bit dest = arg1 + arg2 (low byte first).
; dest may alias either argument; clobbers A, carry out is left in C.
; Fix: the previous expansion ignored arg1 and computed dest = arg2 + dest,
; which is only correct when arg1 is the same location as dest. Calls such
; as 'add32 zy, zx_zy, zx_zy' and 'add32 dist, zx_2, zy_2' therefore
; accumulated the stale contents of dest.
; 38 cycles
.macro add32 dest, arg1, arg2
add 4, dest, arg1, arg2
.endmacro
; dest (one byte) += C. Used directly after a multi-byte add to
; propagate its carry into the next higher byte. Clobbers A.
; 8 cycles
.macro add_carry dest
lda dest ; 3 cyc
adc #0 ; 2 cyc
sta dest ; 3 cyc
.endmacro
; dest = arg1 - arg2 over 'bytes' bytes, low byte first.
; dest may alias either argument; clobbers A, borrow state is left in C.
; 2 + 9 * byte cycles
.macro sub bytes, dest, arg1, arg2
sec ; 2 cyc
.repeat bytes, byte ; 9 * byte cycles
lda arg1 + byte
sbc arg2 + byte
sta dest + byte
.endrepeat
.endmacro
; 16-bit dest = arg1 - arg2
; 20 cycles
.macro sub16 dest, arg1, arg2
sub 2, dest, arg1, arg2
.endmacro
; 32-bit dest = arg1 - arg2
; 38 cycles
.macro sub32 dest, arg1, arg2
sub 4, dest, arg1, arg2
.endmacro
; Shift a 'bytes'-byte little-endian value left by one bit, in place.
; The bit shifted out of the top byte is left in C.
; NOTE(review): cycle counts in this family are the author's figures and
; have not been re-derived here.
; 3 + 5 * bytes cycles
.macro shl bytes, arg
asl arg ; 3 cyc
.repeat bytes-1, i
rol arg + 1 + i ; 5 cyc
.endrepeat
.endmacro
; 13 cycles
.macro shl16 arg
shl 2, arg
.endmacro
; 18 cycles
.macro shl24 arg
shl 3, arg
.endmacro
; 23 cycles
.macro shl32 arg
shl 4, arg
.endmacro
; Copy 'bytes' bytes from arg to dest, lowest address first. Clobbers A.
; 6 * bytes cycles
.macro copy bytes, dest, arg
.repeat bytes, byte ; 6 * bytes cycles
lda arg + byte ; 3 cyc
sta dest + byte ; 3 cyc
.endrepeat
.endmacro
; 12 cycles
.macro copy16 dest, arg
copy 2, dest, arg
.endmacro
; 24 cycles
.macro copy32 dest, arg
copy 4, dest, arg
.endmacro
; 6-byte copy, the size of a float48
; 36 cycles
.macro copyfloat dest, arg
copy 6, dest, arg
.endmacro
; Two's-complement negate a 'bytes'-byte little-endian value in place
; (computed as 0 - arg with borrow propagation). Clobbers A.
; 2 + 8 * byte cycles
.macro neg bytes, arg
sec ; 2 cyc
.repeat bytes, byte ; 8 * byte cycles
lda #00 ; 2 cyc
sbc arg + byte ; 3 cyc
sta arg + byte ; 3 cyc
.endrepeat
.endmacro
; 18 cycles
.macro neg16 arg
neg 2, arg
.endmacro
; 34 cycles
.macro neg32 arg
neg 4, arg
.endmacro
; Shift a 32-bit value left by 'shift' bits in place, then round its top
; 16 bits (arg + 2, arg + 3) using round16. mandelbrot uses shift = 4 to
; turn a fixed8.24 value into a fixed4.12 value in the top word.
; The low word is left holding the shifted-out fraction.
; 11-27 + 23 * shift cycles
; 103-119 cycles for shift=4
.macro shift_round_16 arg, shift
.repeat shift
shl32 arg ; 23 cycles
.endrepeat
round16 arg ; 11-27 cycles
.endmacro
; Signed 16x16 -> 32 multiply via imul16_func.
; input: arg1, arg2 as fixed4.12
; output: dest as fixed8.24
; Arguments are passed in FR0/FR1 and the result comes back in FR2;
; clobbers A and whatever imul16_func clobbers (FR0, FR1, FR2, temp2, X,
; and on the XE path also Y and ptr + 1).
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; ? cyc
copy32 dest, FR2 ; 24 cyc
.endmacro
; Signed 16-bit square via sqr16_func.
; input: arg as fixed4.12
; output: dest as fixed8.24
; Argument is passed in FR0 and the result comes back in FR2; FR1 is used
; as scratch by sqr16_func.
.macro sqr16 dest, arg
copy16 FR0, arg ; 12 cyc
jsr sqr16_func ; ? cyc
copy32 dest, FR2 ; 24 cyc
.endmacro
; dest = arg * arg, looked up in the imported sqr_lobyte / sqr_hibyte tables.
; input: arg as u8
; output: dest as u16
; clobbers a, x
.macro sqr8 dest, arg
ldx arg
lda sqr_lobyte,x
sta dest
lda sqr_hibyte,x
sta dest + 1
.endmacro
; dest += arg * arg, using the same tables. Carry out of the 16-bit sum
; is left in C and not propagated further.
; input: arg as u8
; input/output: dest as u16
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
clc
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1
sta dest + 1
.endmacro
.segment "TABLES"
; lookup table for top byte -> PORTB value for bank-switch
; The top two bits of the index select one of four banks; they are moved
; into PORTB bits 2-3 and combined with the fixed bits $e3, the same
; encoding the bank_switch macro produces for a constant bank number.
.align 256
bank_switch_table:
.repeat 256, i
.byte ((i & $c0) >> 4) | $e3
.endrepeat
.code
; Select extended-RAM bank 'bank' (0-3) at EXTENDED_RAM. Clobbers A.
.macro bank_switch bank
lda #((bank << 2) | $e3)
sta PORTB
.endmacro
; Unsigned 8x8 -> 16 multiply: dest (u16) = arg1 * arg2.
; xe selects the implementation at assembly time:
;   xe != 0: lookup in the 64KB table built by imul8xe_init, spread over
;            four extended-RAM banks (bank chosen by the top 2 bits of arg2)
;   xe == 0: table-of-half-squares method using the imported mul_* tables
; Both variants clobber A and X; the xe variant also clobbers Y and
; ptr + 1, and relies on the low byte of ptr being 0.
.macro imul8 dest, arg1, arg2, xe
.if xe
; using 64KB lookup table
; 51-70 cycles
; clobbers x, y, dest, ptr
.scope
output = dest
; top 2 bits are the table bank selector
ldx arg2 ; 3 cyc
lda bank_switch_table,x ; 4 cyc
sta PORTB ; 4 cyc
; bottom 14 bits except the LSB are the per-bank table index
; add $4000 for the bank pointer
; (page = $40 + low 6 bits of arg2; offset within the page = arg1 with
; bit 0 cleared, giving the 2-byte entry for (arg1 & $fe) * arg2)
txa ; 2 cyc
and #$3f ; 2 cyc
ora #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; copy the entry into output
lda arg1 ; 3 cyc
and #$fe ; 2 cyc
tay ; 2 cyc
lda (ptr),y ; 5 cyc
sta output ; 3 cyc
iny ; 2 cyc
lda (ptr),y ; 5 cyc
sta output+1 ; 3 cyc
; note: we are not restoring memory to save 6 cycles!
; this means those 16kb have to be switched back to base RAM
; if we need to use them anywhere else
;;; restore memory
;;lda #$81 ; 2 cyc - disabled
;;sta PORTB ; 4 cyc - disabled
; check that 1 bit we skipped to fit into space
lda arg1 ; 3 cyc
and #1 ; 2 cyc
beq done ; 2 cyc
; add arg2 one last time for the skipped bit
; (X still holds arg2 from the load at the top)
clc ; 2 cyc
txa ; 2 cyc
adc output ; 3 cyc
sta output ; 3 cyc
lda #0 ; 2 cyc
adc output+1 ; 3 cyc
sta output+1 ; 3 cyc
done:
.endscope
.else
; Using base 48k RAM compatibility mode
; Small table of half squares
; Adapted from https://everything2.com/title/Fast+6502+multiplication
; 81-92 cycles
; Computes a*x as (a+x)^2/2 - a^2/2 - x^2/2, with a +1 fix-up when both
; factors are odd (see the kludge note below).
.scope
mul_factor_a = arg1
mul_factor_x = arg2
mul_product_lo = dest
mul_product_hi = dest + 1
lda mul_factor_a ; 3 cyc
; (a + x)^2/2
clc ; 2 cyc
adc mul_factor_x ; 3 cyc
tax ; 2 cyc
; carry set means a + x >= 256: take the high byte from the 512 table
bcc under256 ; 2 cyc
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
under256:
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
next:
; C is set on both paths here, as required by the sbc chain below
sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
; + x & a & 1:
; (this is a kludge to correct a
; roundoff error that makes odd * odd too low)
ldx mul_factor_x ; 3 cyc
txa ; 2 cyc
and mul_factor_a ; 3 cyc
and #1 ; 2 cyc
clc ; 2 cyc
adc mul_product_lo ; 3 cyc
bcc small_product ; 2 cyc
inc mul_product_hi ; 5 cyc
; - x^2/2
small_product:
sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
.endscope
.endif
.endmacro
; Detect extended (bank-switched) RAM and, if present, switch the
; multiply routines over to the 64KB lookup-table implementation.
;
; Detection: write 0 to EXTENDED_RAM with bank 0 selected, 1 with bank 1
; selected, then re-select bank 0 and read back. Reading 0 means the two
; writes landed in different memory, i.e. banking works.
;
; When banking works this routine:
;   - overwrites the first 3 bytes of imul16_func and sqr16_func with a
;     'jmp' to their xe counterparts (self-modifying code)
;   - zeroes the low byte of ptr, which the xe path of imul8 relies on
;     (it only ever stores ptr + 1)
;   - fills all four banks with the product table
; clobbers: A, Y, ptr, FR0, FR1, FR2, temp2; leaves PORTB on the last
; bank selected.
.proc imul8xe_init
bank_switch 0
lda #0
sta EXTENDED_RAM
bank_switch 1
lda #1
sta EXTENDED_RAM
bank_switch 0
lda EXTENDED_RAM
beq init
; no bank switching available, we just overwrite the value in base ram
rts
init:
; patch imul16_func into a forwarding thunk to imul16xe_func
lda #$4c ; 'jmp' opcode
sta imul16_func
lda #.lobyte(imul16xe_func)
sta imul16_func + 1
lda #.hibyte(imul16xe_func)
sta imul16_func + 2
; ditto for sqr16_func -> sqr16xe_func
lda #$4c ; 'jmp' opcode
sta sqr16_func
lda #.lobyte(sqr16xe_func)
sta sqr16_func + 1
lda #.hibyte(sqr16xe_func)
sta sqr16_func + 2
; create the lookup table
; go through the input set, in four 16KB chunks
arg1 = FR1
arg2 = FR2
result = FR0
; arg2 (the per-page multiplier) starts at 0 and keeps counting across
; the four section calls below; it is not reset between banks.
lda #$00
sta arg1
sta arg2
sta ptr
lda #$40
sta ptr + 1
; $00 * $00 -> $3f * $ff
bank_switch 0
jsr imul8xe_init_section
; $40 * $00 -> $7f * $ff
bank_switch 1
jsr imul8xe_init_section
; $80 * $00 -> $bf * $ff
bank_switch 2
jsr imul8xe_init_section
; $c0 * $00 -> $ff * $ff
bank_switch 3
jsr imul8xe_init_section
rts
.endproc
; Initialize a 16 KB chunk of the table
; Fills $4000-$7fff of the currently selected bank: one 256-byte page per
; arg2 value (64 pages), each holding 128 two-byte products
; (arg1 & $fe) * arg2 for even arg1 = 0, 2, ... 254.
; input: FR2 (arg2) = first multiplier for this bank; FR1 (arg1) = 0
; output: FR2 advanced by 64, ready for the next bank
; clobbers: A, Y, FR0 (running product), FR1, temp2 (write pointer)
.proc imul8xe_init_section
arg1 = FR1
arg2 = FR2
result = FR0
; local write pointer; shadows the global 'ptr' inside this proc
ptr = temp2
lda #$00
sta ptr
lda #$40
sta ptr + 1
ldy #0
; outer loop: $00 -> $3f
outer_loop:
; reset result to 0
lda #0
sta result
sta result + 1
; inner loop: $00 -> $ff
inner_loop:
; copy result to data set
lda result
sta (ptr),y
lda result + 1
iny
sta (ptr),y
dey
; result += 2 * arg2
; (done as two 16-bit additions of arg2, because arg1 advances by 2
; per table entry)
clc
lda arg2
adc result
sta result
lda #0
adc result + 1
sta result + 1
clc
lda arg2
adc result
sta result
lda #0
adc result + 1
sta result + 1
; inner loop check
; the branch tests the low byte of ptr, which wraps to 0 after 128
; entries; arg1 wraps to 0 at the same time
inc arg1
inc arg1
inc ptr
inc ptr
bne inner_loop
; outer loop check
; stop once the pointer reaches $8000, the end of the bank window
inc arg2
inc ptr + 1
lda ptr + 1
cmp #$80
bne outer_loop
rts
.endproc
; Body of the signed 16x16 -> 32 multiply routine, ending in rts.
; xe is passed through to imul8 to choose the 8x8 implementation.
; input: FR0, FR1 = signed 16-bit factors
; output: FR2 = 32-bit product
; The four partial products are computed as unsigned and summed; the
; high word is then corrected for negative inputs.
; clobbers: A, X, temp2 (plus Y and ptr + 1 when xe is set)
.macro imul16_impl xe
.local arg1
.local arg2
.local result
.local inter
.local arg1_pos
.local arg2_pos
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
inter = temp2
; h1l1 * h2l2
; (h1*256 + l1) * (h2*256 + l2)
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
; l1*l2 -> result bytes 0-1
imul8 result, arg1, arg2, xe
lda #0
sta result + 2
sta result + 3
; h1*l2 added at byte offset 1, carry into byte 3
imul8 inter, arg1 + 1, arg2, xe
add16 result + 1, result + 1, inter
add_carry result + 3
; l1*h2 added at byte offset 1, carry into byte 3
imul8 inter, arg1, arg2 + 1, xe
add16 result + 1, result + 1, inter
add_carry result + 3
; h1*h2 added at byte offset 2
imul8 inter, arg1 + 1, arg2 + 1, xe
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013
; (for each negative factor, subtract the other factor from the high word)
lda arg1 + 1
bpl arg1_pos
sub16 result + 2, result + 2, arg2
arg1_pos:
lda arg2 + 1
bpl arg2_pos
sub16 result + 2, result + 2, arg1
arg2_pos:
rts ; 6 cyc
.endmacro
; Body of the signed 16-bit square routine, ending in rts.
; xe is passed through to imul8 to choose the 8x8 implementation.
; input: FR0 = signed 16-bit value (negated in place if negative)
; output: FR2 = 32-bit square
; clobbers: A, X, FR1 (plus Y and ptr + 1 when xe is set)
.macro sqr16_impl xe
.scope
arg = FR0 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
;inter = temp2
inter = FR1
; work on the absolute value; the square is the same either way
lda arg + 1
bpl arg_pos
neg16 arg
arg_pos:
; hl * hl
; (h*256 + l) * (h*256 + l)
; h*256*(h*256 + l) + l*(h*256 + l)
; h*h*256*256 + h*l*256 + h*l*256 + l*l
; l*l -> result bytes 0-1
sqr8 result, arg
lda #0
sta result + 2
sta result + 3
; h*l is computed once and added twice at byte offset 1
imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter
add_carry result + 3
add16 result + 1, result + 1, inter
add_carry result + 3
; h*h added at byte offset 2
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc
.endscope
.endmacro
; Multiply / square entry points called by the imul16 and sqr16 macros.
; The plain variants use the base-RAM half-squares multiply. When
; imul8xe_init detects extended RAM it overwrites the first three bytes
; of imul16_func and sqr16_func with a jmp to the corresponding xe
; variant, so callers always jsr to the plain names.
.proc imul16_func
imul16_impl 0
.endproc
.proc imul16xe_func
imul16_impl 1
.endproc
.proc sqr16_func
sqr16_impl 0
.endproc
.proc sqr16xe_func
sqr16_impl 1
.endproc
; 11-27 cycles
; clobbers A; the low word (arg, arg + 1) is left unchanged
.macro round16 arg
; Round top 16 bits of 32-bit fixed-point number in-place
.local increment
.local high_half
.local check_sign
.local next
; low word > $8000: round up
; = $8000: round up if positive
; round down if negative
; < $8000: round down
; $8000 17
; $8001 27
; $8100 21
; $7fff 11
; After 'cmp #$80' the N flag reflects (byte - $80): clear for bytes
; $81-$ff (round up), set for bytes $00-$7f (leave as is).
lda arg + 1 ; 3 cyc
cmp #$80 ; 2 cyc
beq high_half ; 2 cyc
bpl increment ; 2 cyc
bmi next ; 2 cyc
high_half:
; byte 1 is exactly $80: the lowest byte decides
lda arg ; 3 cyc
beq check_sign ; 2 cyc
jmp increment ; 3 cyc
check_sign:
; exactly half: round up only when the value is non-negative
lda arg + 3 ; 3 cyc
bmi next ; 2 cyc
increment: ; 5-10 cyc
inc arg + 2 ; 5 cyc
bne next ; 2 cyc
inc arg + 3 ; 5 cyc
next:
.endmacro
; Iterate z = z^2 + c for one point until it escapes, the iteration
; counter wraps, or a repeated z value is detected.
.proc mandelbrot
; input:
; cx: position scaled to 8.24 fixed point - -128..+127.9
; cy: position scaled to 8.24
; z_buffer_active: left over from the previous call; when nonzero the
; cycle-detection buffer is used for this point
;
; output:
; iter: iteration count at escape or 0
; z_buffer_active: 1 if iter ended up 0, else 0
;
; clobbers: A, X, Y, zx, zy, zx_2, zy_2, zx_zy, dist, z_buffer_start,
; z_buffer_end, z_buffer contents, and everything the multiply
; routines clobber
; zx = 0
; zy = 0
; zx_2 = 0
; zy_2 = 0
; zx_zy = 0
; dist = 0
; iter = 0
; lda #00
; ldx #(iter - zx + 1)
;initloop:
; sta zx - 1,x
; dex
; bne initloop
; sta z_buffer_start
; sta z_buffer_end
lda #00
sta zx
sta zx + 1
sta zx + 2
sta zx + 3
sta zy
sta zy + 1
sta zy + 2
sta zy + 3
sta zx_2
sta zx_2 + 1
sta zx_2 + 2
sta zx_2 + 3
sta zy_2
sta zy_2 + 1
sta zy_2 + 2
sta zy_2 + 3
sta zx_zy
sta zx_zy + 1
sta zx_zy + 2
sta zx_zy + 3
sta dist
sta dist + 1
sta dist + 2
sta dist + 3
sta iter
sta z_buffer_start
sta z_buffer_end
loop:
; iter++ & max-iters break
; (iter is a u8: when it wraps from 255 to 0 we leave with iter = 0)
inc iter
bne keep_going
jmp exit_path
keep_going:
; Leave the loop (jmp exit_path) unless -max < arg < max, judged on the
; integer byte (arg + 3) and, for exactly -max in that byte, on whether
; any fraction bits are set.
.macro quick_exit arg, max
; arg: fixed8.24
; max: integer
.local positive
.local negative
.local nope_out
.local first_equal
.local all_done
; check sign bit
lda arg + 3
bmi negative
positive:
cmp #max
bmi all_done ; 'less than'
jmp exit_path
negative:
cmp #(256 - max)
beq first_equal ; 'equal' on first byte
bpl all_done ; 'greater than'
nope_out:
jmp exit_path
first_equal:
; following bytes all 0 shows it's really 'equal'
lda arg + 2
bne all_done
lda arg + 1
bne all_done
lda arg
bne all_done
jmp exit_path
all_done:
.endmacro
; 8.24: (-128 .. 127.9)
; zx = zx_2 - zy_2 + cx
sub32 zx, zx_2, zy_2
add32 zx, zx, cx
quick_exit zx, 2
; zy = zx_zy + zx_zy + cy
add32 zy, zx_zy, zx_zy
add32 zy, zy, cy
quick_exit zy, 2
; convert 8.24 -> 4.12: (-8 .. +7.9)
; (after this the fixed4.12 values live in zx + 2 and zy + 2)
shift_round_16 zx, 4
shift_round_16 zy, 4
; zx_2 = zx * zx
sqr16 zx_2, zx + 2
; zy_2 = zy * zy
sqr16 zy_2, zy + 2
; zx_zy = zx * zy
imul16 zx_zy, zx + 2, zy + 2
; dist = zx_2 + zy_2
add32 dist, zx_2, zy_2
quick_exit dist, 4
; if may be in the lake, look for looping output with a small buffer
; as an optimization vs running to max iters
lda z_buffer_active
beq skip_z_buffer
; z_buffer is used as a ring: entries from z_buffer_start up to (not
; including) z_buffer_end are compared against the current z.
ldx z_buffer_start
cpx z_buffer_end
beq z_nothing_to_read
z_buffer_loop:
; Compare one stored byte with arg, advance X, count matches in Y.
.macro z_compare arg
.local compare_no_match
lda z_buffer,x
inx
cmp arg
bne compare_no_match
iny
compare_no_match:
.endmacro
; Wrap the byte index X back to 0 at the end of the buffer.
.macro z_advance
.local skip_reset_x
cpx #(z_buffer_len * 4)
bmi skip_reset_x
ldx #0
skip_reset_x:
.endmacro
; Store one byte at z_buffer[X] and advance X.
.macro z_store arg
lda arg
sta z_buffer,x
inx
.endmacro
; Compare the previously stored z values
ldy #0
z_compare zx + 2
z_compare zx + 3
z_compare zy + 2
z_compare zy + 3
; all four bytes equal: z has repeated, so the orbit is periodic
cpy #4
bne z_no_matches
jmp z_exit
z_no_matches:
z_advance
cpx z_buffer_end
bne z_buffer_loop
z_nothing_to_read:
; Store and expand
; (at this point X == z_buffer_end on both paths into this label)
z_store zx + 2
z_store zx + 3
z_store zy + 2
z_store zy + 3
z_advance
stx z_buffer_end
; Increment the start roller if necessary (limit size)
; NOTE(review): iter is unsigned but tested with bmi, so for iter >= 160
; (with z_buffer_len = 8) the branch is taken and the start index stops
; advancing. The threshold is also in bytes, not entries. This looks
; unintended but only affects how many entries get compared; confirm
; before changing, since the buffer size was tuned against this code.
lda iter
cmp #(z_buffer_len * 4)
bmi skip_inc_start
lda z_buffer_start
clc
adc #4
tax
z_advance
stx z_buffer_start
skip_inc_start:
skip_z_buffer:
jmp loop
z_exit:
; a detected cycle is reported like hitting max iterations
lda #0
sta iter
exit_path:
; z_buffer_active = (iter == 0) ? 1 : 0, carried over to the next call
ldx #0
lda iter
bne next
inx
next:
stx z_buffer_active
rts
.endproc
; Shift the 16-bit value at dest left by (8 - zoom) bits.
; The loop counts X up from zoom until it equals 8, so zoom is expected
; to be <= 8 (keycheck and the viewport table keep it in that range).
.macro scale_zoom dest
; clobbers X, flags
.local cont
.local enough
; cx = (sx << (8 - zoom))
ldx zoom
cont:
cpx #8
beq enough
shl16 dest
inx
jmp cont
enough:
.endmacro
; Convert a screen coordinate into a fixed8.24 plane offset:
; dest = (src << (8 - zoom)) * aspect
; Only the low 16 bits of dest are used for the intermediate value before
; the multiply overwrites all 32 bits.
.macro zoom_factor dest, src, aspect
; output: dest: fixed8.24
; input: src: fixed4.12
; aspect: fixed4.12
; clobbers A, X, flags, etc
copy16 dest, src
scale_zoom dest
; cy = cy * (3 / 4)
; cx = cx * (5 / 4)
imul16 dest, dest, aspect
.endmacro
; Plot one pixel at (sx, sy) with the colour for the current iter.
; clobbers: A, X, Y, temp, pixel_ptr, pixel_color, pixel_mask,
; pixel_shift, pixel_offset
.proc pset
; screen coords in signed sx,sy
; iter holds the target to use
; @todo implement
; iter -> color
ldx iter
lda color_map,x
sta pixel_color
; mask that clears the lowest 2-bit pixel slot; shifted into place below
lda #(255 - 3)
sta pixel_mask
; sy -> line base address in pixel_ptr
lda sy
bpl positive
negative:
; top half: base is the address just past its last line, so adding the
; negative sy * stride below lands inside the top framebuffer
lda #.lobyte(framebuffer_top + stride * half_height)
sta pixel_ptr
lda #.hibyte(framebuffer_top + stride * half_height)
sta pixel_ptr + 1
jmp point
positive:
; bottom half: sy = 0 is its first line
lda #.lobyte(framebuffer_bottom)
sta pixel_ptr
lda #.hibyte(framebuffer_bottom)
sta pixel_ptr + 1
point:
; pixel_ptr += sy * stride
; temp * 40
; = temp * 32 + temp * 8
; = (temp << 5) + (temp << 3)
copy16 temp, sy
shl16 temp
shl16 temp
shl16 temp
add16 pixel_ptr, pixel_ptr, temp
shl16 temp
shl16 temp
add16 pixel_ptr, pixel_ptr, temp
; Ok so pixel_ptr points to the start of the line, which is 40 bytes.
; Get the byte and bit offsets
; temp = sx + half_width, i.e. the column counted from the left edge
lda sx
clc
adc #half_width
sta temp
; pixel_shift = temp & 3
; pixel_color <<= 2 bits, (3 - pixel_shift) times (shifting in zeros)
; pixel_mask <<= 2 bits, (3 - pixel_shift) times (shifting in ones)
; so column 0 of each byte ends up in the top two bits
and #3
sta pixel_shift
lda #3
sec
sbc pixel_shift
tax
shift_loop:
; Z comes from the tax above on entry and from dex on later passes
beq shift_done
asl pixel_color
asl pixel_color
sec
rol pixel_mask
sec
rol pixel_mask
dex
jmp shift_loop
shift_done:
; pixel_offset = temp >> 2
lda temp
lsr a
lsr a
sta pixel_offset
tay
; read, mask, or, write
lda (pixel_ptr),y
and pixel_mask
ora pixel_color
sta (pixel_ptr),y
rts
.endproc
; Draw 'len' characters, read through the zero-page pointer strptr, into
; the text line starting at column 'col'. Each character is translated
; through char_map.
; NOTE(review): char_map has 128 entries. FASC marks its last character
; by setting the high bit, so such a byte would index past the table;
; confirm the caller never draws that far into the string.
.macro draw_text_indirect col, len, strptr
; clobbers A, X, Y
.local loop
.local done
ldx #0
loop:
cpx #len
beq done
txa
tay
lda (strptr),y
tay
lda char_map,y
sta textbuffer + col,x
inx
jmp loop
done:
.endmacro
; Draw 'len' characters from the absolute address cstr into the text line
; starting at column 'col', translating each through char_map.
.macro draw_text col, len, cstr
; clobbers A, X, Y
.local loop
.local done
ldx #0
loop:
cpx #len
beq done
ldy cstr,x
lda char_map,y
sta textbuffer + col,x
inx
jmp loop
done:
.endmacro
; Vertical-blank handler, installed by start through SETVBV.
; Counts frames for the speed readout, steps the chroma and luma cycling
; offsets at their own rates, rewrites the colour registers and leaves
; through XITVBV.
.proc vblank_handler
; one more frame elapsed (read and reset by the main loop)
inc count_frames
; chroma: step chroma_offset once every chroma_delay frames, wrapping
; at palette_chroma_entries
inc chroma_ticks
lda #chroma_delay
cmp chroma_ticks
bne chroma_done
lda #0
sta chroma_ticks
inc chroma_offset
lda #palette_chroma_entries
cmp chroma_offset
bne chroma_done
lda #0
sta chroma_offset
chroma_done:
; luma: step palette_offset once every palette_delay frames, wrapping
; at palette_entries
inc palette_ticks
lda #palette_delay
cmp palette_ticks
bne luma_done
lda #0
sta palette_ticks
inc palette_offset
lda #palette_entries
cmp palette_offset
bne luma_done
lda #0
sta palette_offset
luma_done:
; push the (possibly updated) offsets into the colour registers
jsr update_palette
jmp XITVBV
.endproc
; Write the colour registers from the current cycling offsets.
; COLOR4 is set to 0; COLOR2, COLOR1 and COLOR0 get the chroma nibble
; selected by chroma_offset combined with three consecutive luminance
; entries starting at palette_offset.
; clobbers: A, X, Y
.proc update_palette
lda #0
sta COLOR4
ldx chroma_offset
ldy palette_offset
; the three luminance entries are addressed as fixed offsets from
; palette_start + Y instead of stepping Y between stores
lda palette_chroma,x
ora palette_start,y
sta COLOR2
lda palette_chroma,x
ora palette_start + 1,y
sta COLOR1
lda palette_chroma,x
ora palette_start + 2,y
sta COLOR0
rts
.endproc
; Placeholder for the speed readout update; the real work is currently
; done inline in start (see update_status there). Nothing in the visible
; source calls this routine.
; The rts makes the stub safe to call: without it a jsr here would fall
; straight through into keycheck.
.proc update_speed
; convert frames (u16) to fp
; add to frames_total
; convert pixels (u16) to fp
; add to pixels_total
; (frames_total * 16.66666667) / pixels_total
; convert to ATASCII
; draw text
rts
.endproc
; Poll the keyboard and apply zoom / pan / viewport-preset keys.
; The final lda (#0 or #255) also sets the Z flag, so callers can branch
; with beq/bne directly after the jsr.
.proc keycheck
; clobbers all
; returns 255 in A if state change or 0 if no change
; check keyboard buffer
; ($ff in CH means no key is waiting)
lda CH
cmp #$ff
beq skip_char
; Clear the keyboard buffer and re-enable interrupts
; (the only action taken here is writing $ff back to CH)
ldx #$ff
stx CH
tay
; NOTE(review): this load is redundant; A is reloaded or overwritten on
; every path below before it is used.
lda zoom
cpy #KEY_PLUS
beq plus
cpy #KEY_MINUS
beq minus
; temp = $0010 << (8 - zoom)
lda #$10
sta temp
lda #$00
sta temp + 1
scale_zoom temp
cpy #KEY_UP
beq up
cpy #KEY_DOWN
beq down
cpy #KEY_LEFT
beq left
cpy #KEY_RIGHT
beq right
cpy #KEY_1
beq one
cpy #KEY_2
beq two
cpy #KEY_3
beq three
cpy #KEY_4
beq four
skip_char:
lda #0
rts
plus:
; zoom in, capped at 8
lda zoom
cmp #8
bpl skip_char
inc zoom
jmp done
minus:
; zoom out, not below 0
lda zoom
cmp #1
bmi skip_char
dec zoom
jmp done
; NOTE(review): ox/oy are 32-bit fixed8.24 but the four pan cases below
; do 16-bit arithmetic on their two lowest bytes only, with no carry or
; borrow into the upper word. Confirm the intended byte offset and
; scale of the pan step.
up:
sub16 oy, oy, temp
jmp done
down:
add16 oy, oy, temp
jmp done
left:
sub16 ox, ox, temp
jmp done
right:
add16 ox, ox, temp
jmp done
; keys 1-4 select a preset from the viewport tables
one:
ldx #0
jmp load_key_viewport
two:
ldx #1
jmp load_key_viewport
three:
ldx #2
jmp load_key_viewport
four:
ldx #3
; fall through
load_key_viewport:
jsr load_viewport
; fall through
done:
lda #255
rts
.endproc
; Fill memory from framebuffer_top up to (not including) display_list
; with zeros, one 256-byte page at a time. This covers both framebuffer
; halves and the text line between them.
; clobbers: A, Y, temp
.proc clear_screen
; temp = address of the page being cleared
lda #.lobyte(framebuffer_top)
sta temp
lda #.hibyte(framebuffer_top)
sta temp + 1
ldy #0
next_page:
; Y is 0 whenever we get here (initially, and after wrapping below),
; so this loads the zero fill value
tya
fill_page:
sta (temp),y
iny
bne fill_page
; advance to the next page; stop at the display list's page
inc temp + 1
lda temp + 1
cmp #.hibyte(display_list)
bne next_page
rts
.endproc
; Draw the fixed parts of the text line: the program name at the left
; edge and the " RUN" marker right-aligned in the 40-column line.
; clobbers: A, X, Y
.proc status_bar
; Status bar
draw_text 0, str_self_len, str_self
draw_text 40 - str_run_len, str_run_len, str_run
rts
.endproc
; Load zoom, ox and oy from preset number X of the viewport tables.
; input: viewport selector in x
; clobbers: a, x
.proc load_viewport
lda viewport_zoom,x
sta zoom
; x = selector * 4, the byte offset of this preset's .dword entries
txa
asl a
asl a
tax
; copy the four bytes of each coordinate, low byte first
.repeat 4, i
lda viewport_ox + i,x
sta ox + i
lda viewport_oy + i,x
sta oy + i
.endrepeat
rts
.endproc
; Program entry point (exported). Sets up the multiplier, display and
; vblank handler, then renders forever: each pass of main_loop draws the
; current viewport in max_fill_level progressive passes, polling the
; keyboard after every pixel and restarting when the view changes.
.proc start
jsr imul8xe_init
; initialize viewport
ldx #0 ; overview
jsr load_viewport
; Disable display DMA
lda #0
sta DMACTL
jsr clear_screen
; Copy the display list into properly aligned memory
; Can't cross 1024-byte boundaries :D
ldx #0
copy_byte_loop:
lda display_list_start,x
sta display_list,x
inx
cpx #display_list_len
bne copy_byte_loop
; Set up the display list
lda #.lobyte(display_list)
sta DLISTL ; actual register
sta SDLSTL ; shadow register the OS will copy in
lda #.hibyte(display_list)
sta DLISTH ; actual register
sta SDLSTH ; shadow register the OS will copy in
; Re-enable display DMA
lda #$22
sta DMACTL
; Initialize the palette
; Fix: the tick counters are what must be zeroed here. The previous code
; stored to palette_delay / chroma_delay, which are the constants 23 and
; 137 and so addressed unrelated zero-page bytes (137 = $89 is cx + 1),
; while leaving palette_ticks / chroma_ticks uninitialized.
lda #0
sta palette_offset
sta palette_ticks
sta chroma_offset
sta chroma_ticks
jsr update_palette
; install the vblank handler
lda #7 ; deferred
ldx #.hibyte(vblank_handler)
ldy #.lobyte(vblank_handler)
jsr SETVBV
main_loop:
; count_frames = 0; count_pixels = 0
lda #0
sta count_frames
sta count_pixels
; total_ms = 0.0; total_pixels = 0.0
ldx #total_ms
jsr ZF1
ldx #total_pixels
jsr ZF1
jsr clear_screen
jsr status_bar
lda #0
sta fill_level
fill_loop:
; sy = -92 .. 91
lda #(256-half_height)
sta sy
lda #(256-1)
sta sy + 1
loop_sy:
; sx = -80 .. 79
lda #(256-half_width)
sta sx
lda #(256-1)
sta sx + 1
loop_sx:
; check the fill mask
; First walk the earlier passes: a pixel that matched an earlier mask in
; both sx and sy has already been drawn and is skipped.
ldy #0
loop_skip_level:
cpy fill_level
beq current_level
lda fill_masks,y
and sx
bne not_skipped_mask1
lda fill_masks,y
and sy
beq skipped_mask
not_skipped_mask1:
iny
jmp loop_skip_level
current_level:
; Then require a match against the current pass's mask in both sx and sy.
lda fill_masks,y
and sx
bne skipped_mask
lda fill_masks,y
and sy
beq not_skipped_mask
skipped_mask:
jmp skipped
not_skipped_mask:
; run the fractal!
zoom_factor cx, sx, aspect_x
add32 cx, cx, ox
zoom_factor cy, sy, aspect_y
add32 cy, cy, oy
jsr mandelbrot
jsr pset
; keycheck returns with Z set when nothing changed
jsr keycheck
beq no_key
; @fixme clear the pixel stats
jmp main_loop
no_key:
; check if we should update the counters
;
; count_pixels >= width? update!
; Fix: both counters are unsigned, so the tests use the carry flag.
; The previous signed bmi tests did not match the '>=' intent: with
; width = 160 the first one fired for counts 32..159 instead.
inc count_pixels
lda count_pixels
cmp #width
bcs update_status
; count_frames >= 120? update!
lda count_frames
cmp #120 ; >= 2 seconds
bcc skipped
update_status:
; FR0 = (float)count_pixels & clear count_pixels
lda count_pixels
sta FR0
lda #0
sta FR0 + 1
sta count_pixels
jsr IFP
; FR1 = total_pixels
ldx #.lobyte(total_pixels)
ldy #.hibyte(total_pixels)
jsr FLD1R
; FR0 += FR1
jsr FADD
; total_pixels = FR0
ldx #.lobyte(total_pixels)
ldy #.hibyte(total_pixels)
jsr FST0R
; FR0 = (float)count_frames & clear count_frames
; warning: this should really disable interrupts @TODO
lda count_frames
sta FR0
lda #0
sta FR0 + 1
sta count_frames
jsr IFP
; FR0 *= ms_per_frame
ldx #.lobyte(ms_per_frame)
ldy #.hibyte(ms_per_frame)
jsr FLD1R
jsr FMUL
; FR0 += total_ms
ldx #total_ms
ldy #0
jsr FLD1R
jsr FADD
; total_ms = FR0
ldx #total_ms
ldy #0
jsr FST0R
; FR0 /= total_pixels
ldx #total_pixels
ldy #0
jsr FLD1R
jsr FDIV
; convert to ASCII in INBUFF
jsr FASC
; print the first 6 digits
draw_text_indirect speed_start, speed_precision, INBUFF
draw_text speed_start + speed_precision, str_speed_len, str_speed
skipped:
; sx++ (16-bit); the row ends when the low byte reaches half_width
clc
lda sx
adc #1
sta sx
lda sx + 1
adc #0
sta sx + 1
lda sx
cmp #half_width
beq loop_sx_done
jmp loop_sx
loop_sx_done:
; sy++ (16-bit); the pass ends when the low byte reaches half_height
clc
lda sy
adc #1
sta sy
lda sy + 1
adc #0
sta sy + 1
lda sy
cmp #half_height
beq loop_sy_done
jmp loop_sy
loop_sy_done:
fill_loop_done:
; next progressive pass, or finish after the last one
inc fill_level
lda fill_level
cmp #max_fill_level
beq loop
jmp fill_loop
loop:
; finished
; show DONE and idle until a key changes the view
draw_text 40 - str_done_len, str_done_len, str_done
jsr keycheck
beq loop
jmp main_loop
.endproc