1402 lines
26 KiB
ArmAsm
1402 lines
26 KiB
ArmAsm
; Our zero-page vars
|
|
sx = $80 ; i16: screen pixel x
|
|
sy = $82 ; i16: screen pixel y
|
|
ox = $84 ; fixed4.12: center point x
|
|
oy = $86 ; fixed4.12: center point y
|
|
cx = $88 ; fixed4.12: c_x
|
|
cy = $8a ; fixed4.12: c_y
|
|
zx = $8c ; fixed4.12: z_x
|
|
zy = $8e ; fixed4.12: z_y
|
|
|
|
zx_2 = $90 ; fixed4.12: z_x^2
|
|
zy_2 = $92 ; fixed4.12: z_y^2
|
|
zx_zy = $94 ; fixed4.12: z_x * z_y
|
|
dist = $96 ; fixed4.12: z_x^2 + z_y^2
|
|
|
|
iter = $a0 ; u8: iteration count
|
|
|
|
zoom = $a1 ; u8: zoom shift level
|
|
count_frames = $a2 ; u8
|
|
count_pixels = $a3 ; u8
|
|
total_ms = $a4 ; float48
|
|
total_pixels = $aa ; float48
|
|
|
|
z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
|
|
z_buffer_start = $b1 ; u8: index into z_buffer
|
|
z_buffer_end = $b2 ; u8: index into z_buffer
|
|
temp = $b4 ; u16
|
|
temp2 = $b6 ; u16
|
|
pixel_ptr = $b8 ; u16
|
|
pixel_color = $ba ; u8
|
|
pixel_mask = $bb ; u8
|
|
pixel_shift = $bc ; u8
|
|
pixel_offset = $bd ; u8
|
|
fill_level = $be ; u8
|
|
palette_offset = $bf ; u8
|
|
|
|
; FP registers in zero page
|
|
FR0 = $d4 ; float48
|
|
FRE = $da
|
|
FR1 = $e0 ; float48
|
|
FR2 = $e6 ; float48
|
|
CIX = $f2 ; u8 - index into INBUFF
|
|
INBUFF = $f3 ; u16 - pointer to ascii
|
|
FLPTR = $fc ; u16 - pointer to user buffer float48
|
|
|
|
CH1 = $02f2 ; previous character read from keyboard
|
|
CH = $02fc ; current character read from keyboard
|
|
|
|
LBUFF = $0580 ; result buffer for FASC routine
|
|
|
|
; FP ROM routine vectors
|
|
FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
|
|
IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
|
|
FADD = $DA66 ; ADDITION (FR0 += FR1)
|
|
FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1)
|
|
FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1)
|
|
FDIV = $DB28 ; DIVISION (FR0 /= FR1)
|
|
ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
|
|
FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
|
|
FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
|
|
FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
|
|
FMOVE = $DDB6 ; MOVE FR0 TO FR1
|
|
|
|
; High data
|
|
framebuffer_top = $8000
|
|
textbuffer = $8f00
|
|
framebuffer_bottom = $9000
|
|
display_list = $9f00
|
|
framebuffer_end = $a000
|
|
|
|
height = 184
|
|
half_height = height >> 1
|
|
width = 160
|
|
half_width = width >> 1
|
|
stride = width >> 2
|
|
|
|
EXTENDED_RAM = $4000 ; 16KiB bank on the XE
|
|
PORTB = $D301 ; memory & bank-switch for XL/XE
|
|
|
|
DMACTL = $D400
|
|
DLISTL = $D402
|
|
DLISTH = $D403
|
|
WSYNC = $D40A
|
|
|
|
; OS shadow registers
|
|
SDLSTL = $230
|
|
SDLSTH = $231
|
|
|
|
; interrupt stuff
|
|
SYSVBV = $E45F
|
|
XITVBV = $E462
|
|
SETVBV = $E45C
|
|
|
|
COLOR0 = $2C4
|
|
COLOR1 = $2C5
|
|
COLOR2 = $2C6
|
|
COLOR3 = $2C7
|
|
COLOR4 = $2C8
|
|
|
|
; Keycodes!
|
|
KEY_PLUS = $06
|
|
KEY_MINUS = $0e
|
|
KEY_UP = $8e
|
|
KEY_DOWN = $8f
|
|
KEY_LEFT = $86
|
|
KEY_RIGHT = $87
|
|
|
|
.struct float48
|
|
exponent .byte
|
|
mantissa .byte 6
|
|
.endstruct
|
|
|
|
.import mul_lobyte256
|
|
.import mul_hibyte256
|
|
.import mul_hibyte512
|
|
|
|
.data
|
|
|
|
strings:
|
|
str_self:
|
|
.byte "MANDEL-6502"
|
|
str_self_end:
|
|
str_speed:
|
|
.byte " ms/px"
|
|
str_speed_end:
|
|
str_run:
|
|
.byte " RUN"
|
|
str_run_end:
|
|
str_done:
|
|
.byte "DONE"
|
|
str_done_end:
|
|
|
|
str_self_len = str_self_end - str_self
|
|
str_speed_len = str_speed_end - str_speed
|
|
str_run_len = str_run_end - str_run
|
|
str_done_len = str_done_end - str_done
|
|
speed_precision = 6
|
|
|
|
speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
|
|
speed_len = 14 + str_speed_len
|
|
|
|
|
|
char_map:
|
|
; Map ATASCII string values to framebuffer font entries
|
|
; Sighhhhh
|
|
.repeat 32, i
|
|
.byte i + 64
|
|
.endrepeat
|
|
.repeat 64, i
|
|
.byte i
|
|
.endrepeat
|
|
.repeat 32, i
|
|
.byte 96 + i
|
|
.endrepeat
|
|
|
|
hex_chars:
|
|
.byte "0123456789abcdef"
|
|
|
|
aspect:
|
|
; aspect ratio!
|
|
; pixels at 320w are 5:6 (narrow)
|
|
; pixels at 160w are 5:3 (wide)
|
|
;
|
|
; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
|
|
; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
|
|
;
|
|
; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
|
|
; &horizontal range -80 .. 79.9 is -3.125 .. 3.124
|
|
;
|
|
; 184h is the equiv of 220.8h at square pixels
|
|
; 320 / 220.8 = 1.45 display aspect ratio
|
|
aspect_x: ; fixed4.16 5/4
|
|
.word 5 << (12 - 2)
|
|
|
|
aspect_y: ; fixed4.16 3/4
|
|
.word 3 << (12 - 2)
|
|
|
|
ms_per_frame: ; float48 16.66666667
|
|
.byte 64 ; exponent/sign
|
|
.byte $16 ; BCD digits
|
|
.byte $66
|
|
.byte $66
|
|
.byte $66
|
|
.byte $67
|
|
|
|
display_list_start:
|
|
; 24 lines overscan
|
|
.repeat 3
|
|
.byte $70 ; 8 blank lines
|
|
.endrep
|
|
|
|
; 8 scan lines, 1 row of 40-column text
|
|
.byte $42
|
|
.addr textbuffer
|
|
|
|
; 184 lines graphics
|
|
; ANTIC mode e (160px 2bpp, 1 scan line per line)
|
|
.byte $4e
|
|
.addr framebuffer_top
|
|
.repeat half_height - 1
|
|
.byte $0e
|
|
.endrep
|
|
.byte $4e
|
|
.addr framebuffer_bottom
|
|
.repeat half_height - 1
|
|
.byte $0e
|
|
.endrep
|
|
|
|
.byte $41 ; jump and blank
|
|
.addr display_list
|
|
display_list_end:
|
|
display_list_len = display_list_end - display_list_start
|
|
|
|
color_map:
|
|
.byte 0
|
|
.repeat 85
|
|
.byte 1
|
|
.byte 2
|
|
.byte 3
|
|
.endrepeat
|
|
|
|
palette:
|
|
.byte $00
|
|
.byte $46
|
|
.byte $78
|
|
.byte $b4
|
|
.code
|
|
|
|
z_buffer_len = 16
|
|
z_buffer_mask = z_buffer_len - 1
|
|
z_buffer:
|
|
; the last N zx/zy values
|
|
.repeat z_buffer_len
|
|
.word 0
|
|
.word 0
|
|
.endrepeat
|
|
|
|
.export start
|
|
|
|
max_fill_level = 6
|
|
fill_masks:
|
|
.byte %00011111
|
|
.byte %00001111
|
|
.byte %00000111
|
|
.byte %00000011
|
|
.byte %00000001
|
|
.byte %00000000
|
|
|
|
; 2 + 9 * byte cycles
|
|
.macro add bytes, dest, arg1, arg2
|
|
clc ; 2 cyc
|
|
.repeat bytes, byte ; 9 * byte cycles
|
|
lda arg1 + byte
|
|
adc arg2 + byte
|
|
sta dest + byte
|
|
.endrepeat
|
|
.endmacro
|
|
|
|
.macro add16 dest, arg1, arg2
|
|
add 2, dest, arg1, arg2
|
|
.endmacro
|
|
|
|
.macro add32 dest, arg1, arg2
|
|
add 4, dest, arg2, dest
|
|
.endmacro
|
|
|
|
.macro add_carry dest
|
|
lda dest
|
|
adc #0
|
|
sta dest
|
|
.endmacro
|
|
|
|
; 2 + 9 * byte cycles
|
|
.macro sub bytes, dest, arg1, arg2
|
|
sec ; 2 cyc
|
|
.repeat bytes, byte ; 9 * byte cycles
|
|
lda arg1 + byte
|
|
sbc arg2 + byte
|
|
sta dest + byte
|
|
.endrepeat
|
|
.endmacro
|
|
|
|
.macro sub16 dest, arg1, arg2
|
|
sub 2, dest, arg1, arg2
|
|
.endmacro
|
|
|
|
.macro sub32 dest, arg1, arg2
|
|
sub 4, dest, arg1, arg2
|
|
.endmacro
|
|
|
|
.macro shl bytes, arg
|
|
asl arg
|
|
.repeat bytes-1, i
|
|
rol arg + 1 + i
|
|
.endrepeat
|
|
.endmacro
|
|
|
|
.macro shl16 arg
|
|
shl 2, arg
|
|
.endmacro
|
|
|
|
.macro shl24 arg
|
|
shl 3, arg
|
|
.endmacro
|
|
|
|
.macro shl32 arg
|
|
shl 4, arg
|
|
.endmacro
|
|
|
|
; 6 * bytes cycles
|
|
.macro copy bytes, dest, arg
|
|
.repeat bytes, byte ; 6 * bytes cycles
|
|
lda arg + byte ; 3 cyc
|
|
sta dest + byte ; 3 cyc
|
|
.endrepeat
|
|
.endmacro
|
|
|
|
.macro copy16 dest, arg
|
|
copy 2, dest, arg
|
|
.endmacro
|
|
|
|
.macro copy32 dest, arg
|
|
copy 4, dest, arg
|
|
.endmacro
|
|
|
|
.macro copyfloat dest, arg
|
|
copy 6, dest, arg
|
|
.endmacro
|
|
|
|
; 2 + 8 * byte cycles
|
|
.macro neg bytes, arg
|
|
sec ; 2 cyc
|
|
.repeat bytes, byte ; 8 * byte cycles
|
|
lda #00 ; 2 cyc
|
|
sbc arg + byte ; 3 cyc
|
|
sta arg + byte ; 3 cyc
|
|
.endrepeat
|
|
.endmacro
|
|
|
|
; 18 cycles
|
|
.macro neg16 arg
|
|
neg 2, arg
|
|
.endmacro
|
|
|
|
; 34 cycles
|
|
.macro neg32 arg
|
|
neg 4, arg
|
|
.endmacro
|
|
|
|
; 518 - 828 cyc
|
|
.macro imul16 dest, arg1, arg2
|
|
copy16 FR0, arg1 ; 12 cyc
|
|
copy16 FR1, arg2 ; 12 cyc
|
|
jsr imul16_func ; 470-780 cyc
|
|
copy32 dest, FR2 ; 24 cyc
|
|
.endmacro
|
|
|
|
.macro shift_round_16 arg, shift
|
|
.repeat shift
|
|
shl32 arg
|
|
.endrepeat
|
|
round16 arg
|
|
.endmacro
|
|
|
|
.macro imul16_round dest, arg1, arg2, shift
|
|
copy16 FR0, arg1 ; 12 cyc
|
|
copy16 FR1, arg2 ; 12 cyc
|
|
jsr imul16_func ; 470-780 cyc
|
|
shift_round_16 FR2, shift
|
|
copy16 dest, FR2 + 2 ; 12 cyc
|
|
.endmacro
|
|
|
|
; Adapted from https://everything2.com/title/Fast+6502+multiplication
|
|
.macro imul8 dest, arg1, arg2
|
|
.local under256
|
|
.local next
|
|
.local small_product
|
|
; circa 92 cycles? this doesn't seem right
|
|
; 81-92 cycles
|
|
.scope
|
|
mul_factor_a = arg1
|
|
mul_factor_x = arg2
|
|
mul_product_lo = dest
|
|
mul_product_hi = dest + 1
|
|
|
|
lda mul_factor_a ; 3 cyc
|
|
|
|
; (a + x)^2/2
|
|
clc ; 2 cyc
|
|
adc mul_factor_x ; 3 cyc
|
|
tax ; 2 cyc
|
|
bcc under256 ; 2 cyc
|
|
lda mul_hibyte512,x ; 4 cyc
|
|
bcs next ; 2 cyc
|
|
under256:
|
|
lda mul_hibyte256,x ; 4 cyc
|
|
sec ; 2 cyc
|
|
next:
|
|
sta mul_product_hi ; 3 cyc
|
|
lda mul_lobyte256,x ; 4 cyc
|
|
|
|
; - a^2/2
|
|
ldx mul_factor_a ; 3 cyc
|
|
sbc mul_lobyte256,x ; 4 cyc
|
|
sta mul_product_lo ; 3 cyc
|
|
lda mul_product_hi ; 3 cyc
|
|
sbc mul_hibyte256,x ; 4 cyc
|
|
sta mul_product_hi ; 3 cyc
|
|
|
|
; + x & a & 1:
|
|
; (this is a kludge to correct a
|
|
; roundoff error that makes odd * odd too low)
|
|
ldx mul_factor_x ; 3 cyc
|
|
txa ; 2 cyc
|
|
and mul_factor_a ; 3 cyc
|
|
and #1 ; 2 cyc
|
|
|
|
clc ; 2 cyc
|
|
adc mul_product_lo ; 3 cyc
|
|
bcc small_product ; 2 cyc
|
|
inc mul_product_hi ; 5 cyc
|
|
|
|
; - x^2/2
|
|
small_product:
|
|
sec ; 2 cyc
|
|
sbc mul_lobyte256,x ; 4 cyc
|
|
sta mul_product_lo ; 3 cyc
|
|
lda mul_product_hi ; 3 cyc
|
|
sbc mul_hibyte256,x ; 4 cyc
|
|
sta mul_product_hi ; 3 cyc
|
|
.endscope
|
|
.endmacro
|
|
|
|
; lookup table for top byte -> PORTB value for bank-switch
|
|
;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
|
|
bank_switch_table:
|
|
.repeat 256, i
|
|
.byte ((i & $c0) >> 5) | $c1
|
|
.endrepeat
|
|
|
|
; 58-77 cycles
|
|
; clobbers x, y, dest to dest + 3
|
|
.macro imul8xe dest, arg1, arg2
|
|
.local done
|
|
.local output
|
|
.local ptr
|
|
|
|
output = dest
|
|
ptr = dest + 2 ; scratch space assumed
|
|
|
|
; bottom 14 bits except the LSB are the per-bank table index
|
|
; add $4000 for the bank pointer
|
|
lda arg1 ; 3 cyc
|
|
and #$fe ; 2 cyc
|
|
sta ptr ; 3 cyc
|
|
lda arg2 ; 3 cyc
|
|
and #$3f ; 2 cyc
|
|
clc ; 2 cyc
|
|
adc #$40 ; 2 cyc
|
|
sta ptr + 1 ; 3 cyc
|
|
|
|
; top 2 bits are the table bank selector
|
|
ldx arg2 ; 3 cyc
|
|
lda bank_switch_table,x ; 4 cyc
|
|
sta PORTB ; 4 cyc
|
|
|
|
|
|
; copy the entry into output
|
|
ldy #0 ; 2 cyc
|
|
lda (ptr),y ; 5 cyc
|
|
sta output ; 3 cyc
|
|
iny ; 2 cyc
|
|
lda (ptr),y ; 5 cyc
|
|
sta output+1 ; 3 cyc
|
|
|
|
; note: we are not restoring memory to save 6 cycles!
|
|
; this means those 16kb have to be switched back to base RAM
|
|
; if we need to use them anywhere else
|
|
;;; restore memory
|
|
;;lda #$81 ; 2 cyc - disabled
|
|
;;sta PORTB ; 4 cyc - disabled
|
|
|
|
; check that 1 bit we skipped to fit into space
|
|
lda arg1 ; 3 cyc
|
|
and #1 ; 2 cyc
|
|
beq done ; 2 cyc
|
|
|
|
; add the second param one last time for the skipped bit
|
|
clc ; 2 cyc
|
|
lda arg2 ; 3 cyc
|
|
adc output ; 3 cyc
|
|
sta output ; 3 cyc
|
|
lda #0 ; 2 cyc
|
|
adc output+1 ; 3 cyc
|
|
sta output+1 ; 3 cyc
|
|
|
|
done:
|
|
.endmacro
|
|
|
|
.macro bank_switch bank
|
|
lda #((bank << 1) | $c1)
|
|
sta PORTB
|
|
.endmacro
|
|
|
|
.proc imul8xe_init
|
|
|
|
; go through the input set, in four 16KB chunks
|
|
|
|
arg1 = FR1
|
|
arg2 = FR2
|
|
result = FR0
|
|
|
|
lda #$00
|
|
sta arg1
|
|
sta arg2
|
|
|
|
; $00 * $00 -> $3f * $ff
|
|
bank_switch 0
|
|
jsr imul8xe_init_section
|
|
|
|
; $40 * $00 -> $7f * $ff
|
|
bank_switch 1
|
|
jsr imul8xe_init_section
|
|
|
|
; $80 * $00 -> $bf * $ff
|
|
bank_switch 2
|
|
jsr imul8xe_init_section
|
|
|
|
; $c0 * $00 -> $ff * $ff
|
|
bank_switch 3
|
|
jsr imul8xe_init_section
|
|
|
|
rts
|
|
.endproc
|
|
|
|
; Initialize a 16 KB chunk of the table
|
|
; input: multipliers in temp
|
|
; output: new multipliers in temp
|
|
; clobbers: temp, temp2
|
|
.proc imul8xe_init_section
|
|
arg1 = FR1
|
|
arg2 = FR2
|
|
result = FR0
|
|
ptr = temp2
|
|
|
|
lda #$00
|
|
sta ptr
|
|
lda #$40
|
|
sta ptr + 1
|
|
|
|
ldx #0
|
|
ldy #0
|
|
|
|
; outer loop: $00 -> $3f
|
|
outer_loop:
|
|
|
|
; reset result to 0
|
|
lda #0
|
|
sta result
|
|
sta result + 1
|
|
|
|
; inner loop: $00 -> $ff
|
|
inner_loop:
|
|
|
|
; copy result to data set
|
|
lda result
|
|
sta (ptr),y
|
|
lda result + 1
|
|
sta (ptr),y
|
|
|
|
; result += 2 * arg2
|
|
clc
|
|
lda arg2
|
|
adc result
|
|
sta result
|
|
lda #0
|
|
adc result + 1
|
|
sta result
|
|
lda arg2
|
|
adc result
|
|
sta result
|
|
lda #0
|
|
adc result + 1
|
|
sta result
|
|
|
|
; inner loop check
|
|
inc arg1
|
|
inc arg1
|
|
inc ptr
|
|
inc ptr
|
|
bne inner_loop
|
|
|
|
; outer loop check
|
|
inc arg2
|
|
inc ptr + 1
|
|
lda ptr + 1
|
|
cmp #$40
|
|
bne outer_loop
|
|
|
|
rts
|
|
|
|
.endproc
|
|
|
|
.proc imul16_func
|
|
arg1 = FR0 ; 16-bit arg (clobbered)
|
|
arg2 = FR1 ; 16-bit arg (clobbered)
|
|
result = FR2 ; 32-bit result
|
|
inter = temp2
|
|
|
|
; h1l1 * h2l2
|
|
; (h1*256 + l1) * (h2*256 + l2)
|
|
; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
|
|
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
|
|
|
|
imul8xe result, arg1, arg2
|
|
lda #0
|
|
sta result + 2
|
|
sta result + 3
|
|
|
|
imul8xe inter, arg1 + 1, arg2
|
|
add16 result + 1, result + 1, inter
|
|
add_carry result + 3
|
|
|
|
imul8xe inter, arg1, arg2 + 1
|
|
add16 result + 1, result + 1, inter
|
|
add_carry result + 3
|
|
|
|
imul8xe inter, arg1 + 1, arg2 + 1
|
|
add16 result + 2, result + 2, inter
|
|
|
|
; In case of negative inputs, adjust high word
|
|
; https://stackoverflow.com/a/28827013
|
|
lda arg1 + 1
|
|
bpl arg1_pos
|
|
sub16 result + 2, result + 2, arg2
|
|
arg1_pos:
|
|
lda arg2 + 1
|
|
bpl arg2_pos
|
|
sub16 result + 2, result + 2, arg1
|
|
arg2_pos:
|
|
|
|
rts ; 6 cyc
|
|
.endproc
|
|
|
|
.macro round16 arg
|
|
; Round top 16 bits of 32-bit fixed-point number in-place
|
|
.local increment
|
|
.local high_half
|
|
.local check_sign
|
|
.local next
|
|
|
|
; low word > $8000: round up
|
|
; = $8000: round up if positive
|
|
; round down if negative
|
|
; < $8000: round down
|
|
|
|
lda arg + 1
|
|
cmp #$80
|
|
beq high_half
|
|
bpl increment
|
|
bmi next
|
|
|
|
high_half:
|
|
lda arg
|
|
beq check_sign
|
|
bpl increment
|
|
bmi next
|
|
|
|
check_sign:
|
|
lda arg + 3
|
|
bmi next
|
|
|
|
increment: ; 5-10 cyc
|
|
inc arg + 2 ; 5 cyc
|
|
bne next ; 2 cyc
|
|
inc arg + 3 ; 5 cyc
|
|
|
|
next:
|
|
|
|
.endmacro
|
|
|
|
.proc mandelbrot
|
|
; input:
|
|
; cx: position scaled to 4.12 fixed point - -8..+7.9
|
|
; cy: position scaled to 4.12
|
|
;
|
|
; output:
|
|
; iter: iteration count at escape or 0
|
|
|
|
; zx = 0
|
|
; zy = 0
|
|
; zx_2 = 0
|
|
; zy_2 = 0
|
|
; zx_zy = 0
|
|
; dist = 0
|
|
; iter = 0
|
|
lda #00
|
|
ldx #(iter - zx + 1)
|
|
initloop:
|
|
sta zx - 1,x
|
|
dex
|
|
bne initloop
|
|
sta z_buffer_start
|
|
sta z_buffer_end
|
|
|
|
loop:
|
|
; iter++ & max-iters break
|
|
inc iter
|
|
bne keep_going
|
|
jmp exit_path
|
|
keep_going:
|
|
|
|
.macro quick_exit arg, max
|
|
.local positive
|
|
.local negative
|
|
.local nope_out
|
|
.local first_equal
|
|
.local all_done
|
|
|
|
; check sign bit
|
|
lda arg + 1
|
|
bmi negative
|
|
|
|
positive:
|
|
cmp #((max) << 4)
|
|
bmi all_done ; 'less than'
|
|
jmp exit_path
|
|
|
|
negative:
|
|
cmp #(256 - ((max) << 4))
|
|
beq first_equal ; 'equal' on first byte
|
|
bpl all_done ; 'greater than'
|
|
|
|
nope_out:
|
|
jmp exit_path
|
|
|
|
first_equal:
|
|
lda arg
|
|
beq nope_out ; 2nd byte 0 shows it's really 'equal'
|
|
|
|
all_done:
|
|
.endmacro
|
|
|
|
; 4.12: (-8 .. +7.9)
|
|
; zx = zx_2 - zy_2 + cx
|
|
sub16 zx, zx_2, zy_2
|
|
add16 zx, zx, cx
|
|
quick_exit zx, 2
|
|
|
|
; zy = zx_zy + zx_zy + cy
|
|
add16 zy, zx_zy, zx_zy
|
|
add16 zy, zy, cy
|
|
quick_exit zy, 2
|
|
|
|
; zx_2 = zx * zx
|
|
imul16_round zx_2, zx, zx, 4
|
|
|
|
; zy_2 = zy * zy
|
|
imul16_round zy_2, zy, zy, 4
|
|
|
|
; zx_zy = zx * zy
|
|
imul16_round zx_zy, zx, zy, 4
|
|
|
|
; dist = zx_2 + zy_2
|
|
add16 dist, zx_2, zy_2
|
|
quick_exit dist, 4
|
|
|
|
; if may be in the lake, look for looping output with a small buffer
|
|
; as an optimization vs running to max iters
|
|
lda z_buffer_active
|
|
beq skip_z_buffer
|
|
|
|
ldx z_buffer_start
|
|
cpx z_buffer_end
|
|
beq z_nothing_to_read
|
|
|
|
z_buffer_loop:
|
|
.macro z_compare arg
|
|
.local compare_no_match
|
|
lda z_buffer,x
|
|
inx
|
|
cmp arg
|
|
bne compare_no_match
|
|
iny
|
|
compare_no_match:
|
|
.endmacro
|
|
.macro z_advance
|
|
.local skip_reset_x
|
|
cpx #(z_buffer_len * 4)
|
|
bmi skip_reset_x
|
|
ldx #0
|
|
skip_reset_x:
|
|
.endmacro
|
|
.macro z_store arg
|
|
lda arg
|
|
sta z_buffer,x
|
|
inx
|
|
.endmacro
|
|
|
|
; Compare the previously stored z values
|
|
ldy #0
|
|
z_compare zx
|
|
z_compare zx + 1
|
|
z_compare zy
|
|
z_compare zy + 1
|
|
|
|
cpy #4
|
|
bne z_no_matches
|
|
jmp z_exit
|
|
|
|
z_no_matches:
|
|
z_advance
|
|
|
|
cpx z_buffer_end
|
|
bne z_buffer_loop
|
|
|
|
z_nothing_to_read:
|
|
|
|
; Store and expand
|
|
z_store zx
|
|
z_store zx + 1
|
|
z_store zy
|
|
z_store zy + 1
|
|
z_advance
|
|
stx z_buffer_end
|
|
|
|
; Increment the start roller if necessary (limit size)
|
|
lda iter
|
|
cmp #(z_buffer_len * 4)
|
|
bmi skip_inc_start
|
|
lda z_buffer_start
|
|
clc
|
|
adc #4
|
|
tax
|
|
z_advance
|
|
stx z_buffer_start
|
|
skip_inc_start:
|
|
|
|
skip_z_buffer:
|
|
|
|
jmp loop
|
|
|
|
z_exit:
|
|
lda #0
|
|
sta iter
|
|
|
|
exit_path:
|
|
ldx #0
|
|
lda iter
|
|
bne next
|
|
inx
|
|
next:
|
|
stx z_buffer_active
|
|
rts
|
|
|
|
.endproc
|
|
|
|
.macro scale_zoom dest
|
|
; clobbers X, flags
|
|
.local cont
|
|
.local enough
|
|
|
|
; cx = (sx << (8 - zoom))
|
|
ldx zoom
|
|
cont:
|
|
cpx #8
|
|
beq enough
|
|
shl16 dest
|
|
inx
|
|
jmp cont
|
|
enough:
|
|
.endmacro
|
|
|
|
.macro zoom_factor dest, src, zoom, aspect
|
|
; clobbers A, X, flags, etc
|
|
copy16 dest, src
|
|
scale_zoom dest
|
|
|
|
; cy = cy * (3 / 4)
|
|
; cx = cx * (5 / 4)
|
|
imul16_round dest, dest, aspect, 4
|
|
.endmacro
|
|
|
|
.proc pset
|
|
; screen coords in signed sx,sy
|
|
; iter holds the target to use
|
|
; @todo implement
|
|
|
|
; iter -> color
|
|
ldx iter
|
|
lda color_map,x
|
|
sta pixel_color
|
|
lda #(255 - 3)
|
|
sta pixel_mask
|
|
|
|
; sy -> line base address in temp
|
|
lda sy
|
|
bpl positive
|
|
|
|
negative:
|
|
; temp1 = top half
|
|
lda #.lobyte(framebuffer_top + stride * half_height)
|
|
sta pixel_ptr
|
|
lda #.hibyte(framebuffer_top + stride * half_height)
|
|
sta pixel_ptr + 1
|
|
jmp point
|
|
|
|
positive:
|
|
|
|
lda #.lobyte(framebuffer_bottom)
|
|
sta pixel_ptr
|
|
lda #.hibyte(framebuffer_bottom)
|
|
sta pixel_ptr + 1
|
|
|
|
point:
|
|
|
|
; pixel_ptr += sy * stride
|
|
; temp * 40
|
|
; = temp * 32 + temp * 8
|
|
; = (temp << 5) + (temp << 3)
|
|
copy16 temp, sy
|
|
shl16 temp
|
|
shl16 temp
|
|
shl16 temp
|
|
add16 pixel_ptr, pixel_ptr, temp
|
|
shl16 temp
|
|
shl16 temp
|
|
add16 pixel_ptr, pixel_ptr, temp
|
|
|
|
; Ok so temp1 points to the start of the line, which is 40 bytes.
|
|
; Get the byte and bit offsets
|
|
lda sx
|
|
clc
|
|
adc #half_width
|
|
sta temp
|
|
|
|
; pixel_shift = temp & 3
|
|
; pixel_color <<= pixel_shift (shifting in zeros)
|
|
; pixel_mask <<= pixel_shift (shifting in ones)
|
|
and #3
|
|
sta pixel_shift
|
|
lda #3
|
|
sec
|
|
sbc pixel_shift
|
|
tax
|
|
shift_loop:
|
|
beq shift_done
|
|
asl pixel_color
|
|
asl pixel_color
|
|
sec
|
|
rol pixel_mask
|
|
sec
|
|
rol pixel_mask
|
|
dex
|
|
jmp shift_loop
|
|
shift_done:
|
|
|
|
; pixel_offset = temp >> 2
|
|
lda temp
|
|
lsr a
|
|
lsr a
|
|
sta pixel_offset
|
|
tay
|
|
|
|
; read, mask, or, write
|
|
lda (pixel_ptr),y
|
|
and pixel_mask
|
|
ora pixel_color
|
|
sta (pixel_ptr),y
|
|
|
|
rts
|
|
.endproc
|
|
|
|
.macro draw_text_indirect col, len, strptr
|
|
; clobbers A, X
|
|
.local loop
|
|
.local done
|
|
ldx #0
|
|
loop:
|
|
cpx #len
|
|
beq done
|
|
txa
|
|
tay
|
|
lda (strptr),y
|
|
tay
|
|
lda char_map,y
|
|
sta textbuffer + col,x
|
|
inx
|
|
jmp loop
|
|
done:
|
|
.endmacro
|
|
|
|
.macro draw_text col, len, cstr
|
|
; clobbers A, X
|
|
.local loop
|
|
.local done
|
|
ldx #0
|
|
loop:
|
|
cpx #len
|
|
beq done
|
|
ldy cstr,x
|
|
lda char_map,y
|
|
sta textbuffer + col,x
|
|
inx
|
|
jmp loop
|
|
done:
|
|
.endmacro
|
|
|
|
.proc vblank_handler
|
|
inc count_frames
|
|
inc palette_offset
|
|
jsr update_palette
|
|
jmp XITVBV
|
|
.endproc
|
|
|
|
.proc update_palette
|
|
lda palette
|
|
sta COLOR4
|
|
|
|
clc
|
|
lda palette_offset
|
|
and #$f0
|
|
adc palette + 1
|
|
sta COLOR0
|
|
|
|
clc
|
|
lda palette_offset
|
|
and #$f0
|
|
adc palette + 2
|
|
sta COLOR1
|
|
|
|
clc
|
|
lda palette_offset
|
|
and #$f0
|
|
adc palette + 3
|
|
sta COLOR2
|
|
.endproc
|
|
|
|
.proc update_speed
|
|
; convert frames (u16) to fp
|
|
; add to frames_total
|
|
; convert pixels (u16) to fp
|
|
; add to pixels_total
|
|
; (frames_total * 16.66666667) / pixels_total
|
|
; convert to ATASCII
|
|
; draw text
|
|
.endproc
|
|
|
|
.proc keycheck
|
|
; clobbers all
|
|
; returns 255 in A if state change or 0 if no change
|
|
|
|
; check keyboard buffer
|
|
lda CH
|
|
cmp #$ff
|
|
beq skip_char
|
|
|
|
; Clear the keyboard buffer and re-enable interrupts
|
|
ldx #$ff
|
|
stx CH
|
|
|
|
tay
|
|
|
|
lda zoom
|
|
cpy #KEY_PLUS
|
|
beq plus
|
|
cpy #KEY_MINUS
|
|
beq minus
|
|
|
|
; temp = $0010 << (8 - zoom)
|
|
lda #$10
|
|
sta temp
|
|
lda #$00
|
|
sta temp + 1
|
|
scale_zoom temp
|
|
|
|
cpy #KEY_UP
|
|
beq up
|
|
cpy #KEY_DOWN
|
|
beq down
|
|
cpy #KEY_LEFT
|
|
beq left
|
|
cpy #KEY_RIGHT
|
|
beq right
|
|
|
|
skip_char:
|
|
lda #0
|
|
rts
|
|
|
|
plus:
|
|
cmp #8
|
|
bpl skip_char
|
|
inc zoom
|
|
jmp done
|
|
minus:
|
|
cmp #1
|
|
bmi skip_char
|
|
dec zoom
|
|
jmp done
|
|
up:
|
|
sub16 oy, oy, temp
|
|
jmp done
|
|
down:
|
|
add16 oy, oy, temp
|
|
jmp done
|
|
left:
|
|
sub16 ox, ox, temp
|
|
jmp done
|
|
right:
|
|
add16 ox, ox, temp
|
|
done:
|
|
lda #255
|
|
rts
|
|
|
|
.endproc
|
|
|
|
.proc clear_screen
|
|
; zero the range from framebuffer_top to display_list
|
|
lda #.lobyte(framebuffer_top)
|
|
sta temp
|
|
lda #.hibyte(framebuffer_top)
|
|
sta temp + 1
|
|
|
|
zero_page_loop:
|
|
lda #0
|
|
ldy #0
|
|
zero_byte_loop:
|
|
sta (temp),y
|
|
iny
|
|
bne zero_byte_loop
|
|
|
|
inc temp + 1
|
|
lda temp + 1
|
|
cmp #.hibyte(display_list)
|
|
bne zero_page_loop
|
|
|
|
rts
|
|
.endproc
|
|
|
|
.proc status_bar
|
|
; Status bar
|
|
draw_text 0, str_self_len, str_self
|
|
draw_text 40 - str_run_len, str_run_len, str_run
|
|
|
|
rts
|
|
.endproc
|
|
|
|
.proc start
|
|
|
|
jsr imul8xe_init
|
|
|
|
; ox = 0; oy = 0; zoom = 0
|
|
; count_frames = 0; count_pixels = 0
|
|
lda #0
|
|
sta ox
|
|
sta ox + 1
|
|
sta oy
|
|
sta oy + 1
|
|
sta count_frames
|
|
sta count_pixels
|
|
|
|
; total_ms = 0.0; total_pixels = 0.0
|
|
ldx #total_ms
|
|
jsr ZF1
|
|
ldx #total_pixels
|
|
jsr ZF1
|
|
|
|
; zoom = 2x
|
|
lda #1
|
|
sta zoom
|
|
|
|
; Disable display DMA
|
|
lda #0
|
|
sta DMACTL
|
|
|
|
jsr clear_screen
|
|
|
|
; Copy the display list into properly aligned memory
|
|
; Can't cross 1024-byte boundaries :D
|
|
ldx #0
|
|
copy_byte_loop:
|
|
lda display_list_start,x
|
|
sta display_list,x
|
|
inx
|
|
cpx #display_list_len
|
|
bne copy_byte_loop
|
|
|
|
; Set up the display list
|
|
lda #.lobyte(display_list)
|
|
sta DLISTL ; actual register
|
|
sta SDLSTL ; shadow register the OS will copy in
|
|
lda #.hibyte(display_list)
|
|
sta DLISTH ; actual register
|
|
sta SDLSTH ; shadow register the OS will copy in
|
|
|
|
; Re-enable display DMA
|
|
lda #$22
|
|
sta DMACTL
|
|
|
|
; Initialize the palette
|
|
lda #0
|
|
sta palette_offset
|
|
jsr update_palette
|
|
|
|
; install the vblank handler
|
|
lda #7 ; deferred
|
|
ldx #.hibyte(vblank_handler)
|
|
ldy #.lobyte(vblank_handler)
|
|
jsr SETVBV
|
|
|
|
main_loop:
|
|
jsr clear_screen
|
|
jsr status_bar
|
|
|
|
lda #0
|
|
sta fill_level
|
|
|
|
fill_loop:
|
|
|
|
; sy = -92 .. 91
|
|
lda #(256-half_height)
|
|
sta sy
|
|
lda #(256-1)
|
|
sta sy + 1
|
|
|
|
loop_sy:
|
|
; sx = -80 .. 79
|
|
lda #(256-half_width)
|
|
sta sx
|
|
lda #(256-1)
|
|
sta sx + 1
|
|
|
|
loop_sx:
|
|
; check the fill mask
|
|
ldy #0
|
|
|
|
loop_skip_level:
|
|
cpy fill_level
|
|
beq current_level
|
|
|
|
lda fill_masks,y
|
|
and sx
|
|
bne not_skipped_mask1
|
|
|
|
lda fill_masks,y
|
|
and sy
|
|
beq skipped_mask
|
|
|
|
not_skipped_mask1:
|
|
iny
|
|
jmp loop_skip_level
|
|
|
|
current_level:
|
|
lda fill_masks,y
|
|
and sx
|
|
bne skipped_mask
|
|
|
|
lda fill_masks,y
|
|
and sy
|
|
beq not_skipped_mask
|
|
|
|
skipped_mask:
|
|
jmp skipped
|
|
|
|
not_skipped_mask:
|
|
|
|
; run the fractal!
|
|
zoom_factor cx, sx, zoom, aspect_x
|
|
add16 cx, cx, ox
|
|
zoom_factor cy, sy, zoom, aspect_y
|
|
add16 cy, cy, oy
|
|
jsr mandelbrot
|
|
jsr pset
|
|
|
|
jsr keycheck
|
|
beq no_key
|
|
; @fixme clear the pixel stats
|
|
jmp main_loop
|
|
|
|
no_key:
|
|
; check if we should update the counters
|
|
;
|
|
; count_pixels >= width? update!
|
|
inc count_pixels
|
|
lda count_pixels
|
|
cmp #width
|
|
bmi update_status
|
|
|
|
; count_frames >= 120? update!
|
|
lda count_frames
|
|
cmp #120 ; >= 2 seconds
|
|
bmi skipped
|
|
|
|
update_status:
|
|
; FR0 = (float)count_pixels & clear count_pixels
|
|
lda count_pixels
|
|
sta FR0
|
|
lda #0
|
|
sta FR0 + 1
|
|
sta count_pixels
|
|
jsr IFP
|
|
|
|
; FR1 = total_pixels
|
|
ldx #.lobyte(total_pixels)
|
|
ldy #.hibyte(total_pixels)
|
|
jsr FLD1R
|
|
|
|
; FR0 += FR1
|
|
jsr FADD
|
|
|
|
; total_pixels = FR0
|
|
ldx #.lobyte(total_pixels)
|
|
ldy #.hibyte(total_pixels)
|
|
jsr FST0R
|
|
|
|
|
|
; FR0 = (float)count_frames & clear count_frames
|
|
; warning: this should really disable interrupts @TODO
|
|
lda count_frames
|
|
sta FR0
|
|
lda #0
|
|
sta FR0 + 1
|
|
sta count_frames
|
|
jsr IFP
|
|
|
|
; FR0 *= ms_per_frame
|
|
ldx #.lobyte(ms_per_frame)
|
|
ldy #.hibyte(ms_per_frame)
|
|
jsr FLD1R
|
|
jsr FMUL
|
|
|
|
; FR0 += total_ms
|
|
ldx #total_ms
|
|
ldy #0
|
|
jsr FLD1R
|
|
jsr FADD
|
|
|
|
; total_ms = FR0
|
|
ldx #total_ms
|
|
ldy #0
|
|
jsr FST0R
|
|
|
|
; FR0 /= total_pixels
|
|
ldx #total_pixels
|
|
ldy #0
|
|
jsr FLD1R
|
|
jsr FDIV
|
|
|
|
; convert to ASCII in INBUFF
|
|
jsr FASC
|
|
|
|
; print the first 6 digits
|
|
draw_text_indirect speed_start, speed_precision, INBUFF
|
|
draw_text speed_start + speed_precision, str_speed_len, str_speed
|
|
|
|
skipped:
|
|
|
|
clc
|
|
lda sx
|
|
adc #1
|
|
sta sx
|
|
lda sx + 1
|
|
adc #0
|
|
sta sx + 1
|
|
|
|
lda sx
|
|
cmp #half_width
|
|
beq loop_sx_done
|
|
jmp loop_sx
|
|
|
|
loop_sx_done:
|
|
|
|
clc
|
|
lda sy
|
|
adc #1
|
|
sta sy
|
|
lda sy + 1
|
|
adc #0
|
|
sta sy + 1
|
|
|
|
lda sy
|
|
cmp #half_height
|
|
beq loop_sy_done
|
|
jmp loop_sy
|
|
|
|
loop_sy_done:
|
|
|
|
fill_loop_done:
|
|
inc fill_level
|
|
lda fill_level
|
|
cmp #max_fill_level
|
|
beq loop
|
|
jmp fill_loop
|
|
|
|
loop:
|
|
; finished
|
|
draw_text 40 - str_done_len, str_done_len, str_done
|
|
jsr keycheck
|
|
beq loop
|
|
jmp main_loop
|
|
|
|
.endproc
|