; Fork 0
; forked from brooke/mandel-6502
; 2023-01-22 12:47:24 -08:00
;
; 730 lines
; 14 KiB

; ---------------------------------------------------------------------------
; Zero-page working storage
; ---------------------------------------------------------------------------

; Screen position of the pixel being rendered
sx           = $80      ; i16: screen pixel x
sy           = $82      ; i16: screen pixel y

; View centre and current point c.
; Note: cx/cy are assigned the same addresses as ox/oy.
ox           = $84      ; fixed4.12: center point x
oy           = $86      ; fixed4.12: center point y
cx           = $84      ; fixed4.12: c_x
cy           = $86      ; fixed4.12: c_y

; Iteration state; mandelbrot clears everything from zx through iter
zx           = $88      ; fixed4.12: z_x
zy           = $8a      ; fixed4.12: z_y
zx_2         = $90      ; fixed8.24: z_x^2
zy_2         = $94      ; fixed8.24: z_y^2
zx_zy        = $98      ; fixed8.24: z_x * z_y
dist         = $9c      ; fixed8.24: z_x^2 + z_y^2
iter         = $a0      ; u8: iteration count

zoom         = $a1      ; u8: zoom shift level

; General-purpose scratch words
temp         = $a2      ; u16
temp2        = $a4      ; u16

; Pixel-plotting scratch used by pset
pixel_ptr    = $b0      ; u16
pixel_color  = $b2      ; u8
pixel_mask   = $b3      ; u8
pixel_shift  = $b4      ; u8
pixel_offset = $b5      ; u8

; FP registers in zero page, used here as multiply operands/result
FR0          = $d4
FRE          = $da
FR1          = $e0
FR2          = $e6
; ---------------------------------------------------------------------------
; High memory layout: the graphics area is split into two buffers, with the
; text row and the live display list placed between/after them.
; ---------------------------------------------------------------------------
framebuffer_top    = $8000
textbuffer         = $8f00
framebuffer_bottom = $9000
display_list       = $9f00
framebuffer_end    = $a000

; Screen geometry in pixels; stride is bytes per line at 4 pixels per byte
height      = 184
half_height = height / 2
width       = 160
half_width  = width / 2
stride      = width / 4

; Hardware registers written directly
DMACTL = $D400
DLISTL = $D402
DLISTH = $D403

; OS shadow registers for the display list pointer
SDLSTL = $230
SDLSTH = $231
; Status-bar strings. Each string is bracketed by a start label and an end
; label so that its length can be computed at assembly time; the *_len
; symbols below and the draw_text call sites depend on these labels.
str_self:
    .byte "MANDEL-6502"
str_self_end:

str_speed:
    .byte "ms/px"
str_speed_end:

str_run:
    .byte " RUN"
str_run_end:

str_done:
    .byte "DONE"
str_done_end:

str_self_len  = str_self_end - str_self
str_speed_len = str_speed_end - str_speed
str_run_len   = str_run_end - str_run
str_done_len  = str_done_end - str_done
; Map ATASCII string values to framebuffer font entries.
; 128-entry lookup table indexed by character code (used by draw_text):
;   codes   0..31  ->  64..95
;   codes  32..95  ->   0..63
;   codes  96..127 ->  96..127
; Each .repeat must be closed before the next one starts, otherwise the
; ranges nest instead of following one another.
char_map:
    .repeat 32, i
        .byte i + 64
    .endrepeat
    .repeat 64, i
        .byte i
    .endrepeat
    .repeat 32, i
        .byte 96 + i
    .endrepeat
; aspect ratio!
; pixels at 320w are 5:6 (narrow)
; pixels at 160w are 5:3 (wide)
; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
; and horizontal range -80 .. 79.9 is -3.125 .. 3.124
; 184h is the equiv of 220.8h at square pixels
; 320 / 220.8 = 1.45 display aspect ratio
;
; Both factors are stored as 4.12 fixed point: n << (12 - 2) is n / 4.
aspect_x:
    .word 5 << (12 - 2)     ; 5 / 4, applied to cx
aspect_y:
    .word 3 << (12 - 2)     ; 3 / 4, applied to cy
; Display list template. `start` copies display_list_len bytes from here to
; `display_list`, so the template must be bracketed by start/end labels.
display_list_start:
    ; 24 lines overscan
    .repeat 3
        .byte $70 ; 8 blank lines
    .endrepeat

    ; 8 scan lines, 1 row of 40-column text
    .byte $42
    .addr textbuffer

    ; 184 lines graphics
    ; ANTIC mode e (160px 2bpp, 1 scan line per line)
    ; First half: one line that loads the address, then half_height - 1 more
    .byte $4e
    .addr framebuffer_top
    .repeat half_height - 1
        .byte $0e
    .endrepeat

    ; Second half, reloading the address to point at the bottom buffer
    .byte $4e
    .addr framebuffer_bottom
    .repeat half_height - 1
        .byte $0e
    .endrepeat

    .byte $41 ; jump and blank
    .addr display_list
display_list_end:

display_list_len = display_list_end - display_list_start
; Iteration count -> 2-bit pixel colour, indexed by `iter` in pset.
; Entry 0 is colour 0; the remaining 255 entries cycle through 1, 2, 3
; (85 * 3 = 255), giving a full 256-entry table.
color_map:
    .byte 0
    .repeat 85
        .byte 1
        .byte 2
        .byte 3
    .endrepeat
.export start
; Multi-byte little-endian addition: dest = arg1 + arg2
;   bytes - operand width in bytes
;   dest, arg1, arg2 - addresses of the operands
; Clobbers A and the flags.
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
    clc                     ; 2 cyc
    .repeat bytes, byte     ; 9 * byte cycles
        lda arg1 + byte
        adc arg2 + byte
        sta dest + byte
    .endrepeat
.endmacro

; 16-bit dest = arg1 + arg2
.macro add16 dest, arg1, arg2
    add 2, dest, arg1, arg2
.endmacro

; 32-bit dest = arg1 + arg2
; Previously this expanded to `add 4, dest, arg2, dest`, ignoring arg1 and
; computing dest = arg2 + dest. Callers that passed dest as arg1 get the
; same result as before.
.macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
.endmacro
; Multi-byte little-endian subtraction: dest = arg1 - arg2
;   bytes - operand width in bytes
;   dest, arg1, arg2 - addresses of the operands
; Clobbers A and the flags.
; 2 + 9 * byte cycles
.macro sub bytes, dest, arg1, arg2
    sec                     ; 2 cyc
    .repeat bytes, byte     ; 9 * byte cycles
        lda arg1 + byte
        sbc arg2 + byte
        sta dest + byte
    .endrepeat
.endmacro

; 16-bit dest = arg1 - arg2
.macro sub16 dest, arg1, arg2
    sub 2, dest, arg1, arg2
.endmacro

; 32-bit dest = arg1 - arg2
.macro sub32 dest, arg1, arg2
    sub 4, dest, arg1, arg2
.endmacro
; Multi-byte shift left by one bit, in place.
; The low byte is shifted with asl; each higher byte takes the carry out of
; the byte below it via rol. Affects the flags only (A, X, Y untouched).
.macro shl bytes, arg
    asl arg
    .repeat bytes-1, i
        rol arg + 1 + i
    .endrepeat
.endmacro

.macro shl16 arg
    shl 2, arg
.endmacro

.macro shl24 arg
    shl 3, arg
.endmacro

.macro shl32 arg
    shl 4, arg
.endmacro
; Multi-byte copy: dest = arg, one byte at a time through A.
; Clobbers A and the N/Z flags.
; 6 * bytes cycles
.macro copy bytes, dest, arg
    .repeat bytes, byte     ; 6 * bytes cycles
        lda arg + byte      ; 3 cyc
        sta dest + byte     ; 3 cyc
    .endrepeat
.endmacro

.macro copy16 dest, arg
    copy 2, dest, arg
.endmacro

.macro copy32 dest, arg
    copy 4, dest, arg
.endmacro
; Multi-byte two's-complement negation in place: arg = 0 - arg.
; The borrow propagates from byte to byte through the carry flag.
; Clobbers A and the flags.
; 2 + 8 * byte cycles
.macro neg bytes, arg
    sec                     ; 2 cyc
    .repeat bytes, byte     ; 8 * byte cycles
        lda #00             ; 2 cyc
        sbc arg + byte      ; 3 cyc
        sta arg + byte      ; 3 cyc
    .endrepeat
.endmacro

; 18 cycles
.macro neg16 arg
    neg 2, arg
.endmacro

; 34 cycles
.macro neg32 arg
    neg 4, arg
.endmacro
; inner loop for imul16
; One shift-and-add step of a 16x16 -> 32 bit unsigned multiply.
;   arg1   - multiplier; bit `bitnum` selects whether arg2 is added
;   arg2   - multiplicand, added into the top 16 bits of result
;   result - 32-bit accumulator, shifted right one bit per step
;   bitnum - assembly-time bit index, 0..15
; Clobbers A and the flags.
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next

    ; does 16-bit adds
    ; arg1 and arg2 are treated as unsigned
    ; negative signed inputs must be flipped first

    ; 7 cycles up to the branch

    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    ; Exactly one of the two tests is assembled, chosen by bitnum.
    .if bitnum < 8
        lda arg1                    ; 3 cyc
        and #(1 << (bitnum))        ; 2 cyc
    .else
        lda arg1 + 1                ; 3 cyc
        and #(1 << ((bitnum) - 8))  ; 2 cyc
    .endif
    bne one                         ; 2 cyc

zero:   ; 18 cyc, 23 cyc
    ; nothing to add: just start the right shift at the top byte
    lsr result + 3  ; 5 cyc
    jmp next        ; 3 cyc

one:    ; 32 cyc, 37 cyc
    ; 16-bit add on the top bits
    clc             ; 2 cyc
    lda result + 2  ; 3 cyc
    adc arg2        ; 3 cyc
    sta result + 2  ; 3 cyc
    lda result + 3  ; 3 cyc
    adc arg2 + 1    ; 3 cyc
    ror a           ; 2 cyc - get a jump on the shift
    sta result + 3  ; 3 cyc

next:
    ; both paths arrive with the bit shifted out of result + 3 in carry
    ror result + 2  ; 5 cyc
    ror result + 1  ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ror result      ; 5 cyc
    .endif
.endmacro
; 5 to 25 cycles
.macro check_sign arg
    ; Check sign bit and flip argument to positive,
    ; keeping a count of sign bits in the X register.
    ; Clobbers A and the flags; X is incremented once per negative argument.
    .local positive
    lda arg + 1     ; 3 cyc
    bpl positive    ; 2 cyc
    neg16 arg       ; 18 cyc
    inx             ; 2 cyc
positive:
.endmacro
; Signed 16x16 -> 32 bit multiply: dest (4 bytes) = arg1 * arg2.
; Operands are staged in FR0/FR1 and the product is read back from FR2.
; Clobbers A, X, the flags, FR0, FR1 and FR2.
; 518 - 828 cyc
.macro imul16 dest, arg1, arg2
    copy16 FR0, arg1    ; 12 cyc
    copy16 FR1, arg2    ; 12 cyc
    jsr imul16_func     ; 470-780 cyc
    copy32 dest, FR2    ; 24 cyc
.endmacro
; Shift the 32-bit value at arg left by `shift` bits (an assembly-time
; constant), then round its top 16 bits in place with round16.
.macro shift_round_16 arg, shift
    .repeat shift
        shl32 arg
    .endrepeat
    round16 arg
.endmacro
; Signed 16x16 multiply reduced back to 16 bits:
; dest (2 bytes) = top 16 bits of ((arg1 * arg2) << shift), rounded.
; Clobbers A, X, the flags, FR0, FR1 and FR2.
.macro imul16_round dest, arg1, arg2, shift
    copy16 FR0, arg1            ; 12 cyc
    copy16 FR1, arg2            ; 12 cyc
    jsr imul16_func             ; 470-780 cyc
    shift_round_16 FR2, shift
    copy16 dest, FR2 + 2        ; 12 cyc
.endmacro
; Signed 16x16 -> 32 bit multiply.
;   in:  FR0, FR1 - 16-bit signed operands (both clobbered)
;   out: FR2      - 32-bit signed product
; Clobbers A, X and the flags.
; min 470 cycles
; max 780 cycles
.proc imul16_func
    arg1 = FR0      ; 16-bit arg (clobbered)
    arg2 = FR1      ; 16-bit arg (clobbered)
    result = FR2    ; 32-bit result

    ldx #0          ; 2 cyc
    ; counts the number of sign bits in X
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc

    ; zero out the 32-bit temp's top 16 bits
    lda #0          ; 2 cyc
    sta result + 2  ; 3 cyc
    sta result + 3  ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts

    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    ; 440 to 696 cycles
    .repeat 16, bitnum
        ; bitnum < 8: 25 or 41 cycles
        ; bitnum >= 8: 30 or 46 cycles
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; In case of mixed input signs, return a negative result.
    ; X is 1 only when exactly one operand was negative.
    cpx #1                  ; 2 cyc
    bne positive_result     ; 2 cyc
    neg32 result            ; 34 cyc
positive_result:
    rts                     ; 6 cyc
.endproc
.macro round16 arg
    ; Round top 16 bits of 32-bit fixed-point number in-place
    ;
    ; arg + 0..1 is the fraction being discarded, arg + 2..3 the 16 bits
    ; that are kept. If the top bit of the discarded fraction is set the
    ; fraction is >= 1/2, so the kept word is incremented by one.
    ;
    ; Two fixes over the previous version:
    ;  - the carry from the low kept byte was tested with the wrong
    ;    polarity, so the high byte was incremented when the low byte did
    ;    NOT wrap and left alone when it did;
    ;  - negative values were decremented. The kept word of a
    ;    two's-complement number is already rounded toward minus infinity,
    ;    so the correction is +1 for both signs.
    ;
    ; Clobbers A and the flags.
    ; no round - 5 cycles (branch counted as 2, as elsewhere in this file)
    ; round, no carry - 12
    ; round, carry - 17
    .local next

    lda arg + 1     ; 3 cyc
    bpl next        ; 2 cyc - fraction < 1/2, nothing to do

    inc arg + 2     ; 5 cyc
    bne next        ; 2 cyc - no wrap, high byte is unchanged
    inc arg + 3     ; 5 cyc - low byte wrapped to 0, carry into high byte

next:
.endmacro
.proc mandelbrot
    ; input:
    ;   cx: position scaled to 4.12 fixed point - -8..+7.9
    ;   cy: position scaled to 4.12
    ;
    ; output:
    ;   iter: iteration count at escape or 0
    ;
    ; Clobbers A, X, the flags, FR0-FR2 and every variable from zx to iter.

    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
    ; One descending loop clears every byte from zx up to and including iter.
    lda #00
    ldx #(iter - zx + 1)
initloop:
    sta zx - 1,x
    dex
    bne initloop

loop:
    ; iter++ & max-iters break
    ; iter is a u8: when it wraps back to 0 the point never escaped, and
    ; the 0 left in iter is the documented "did not escape" result.
    inc iter
    bne keep_going
    rts
keep_going:

    ; Return from mandelbrot as soon as the high byte of arg shows the
    ; 4.12 value has left the range that the compares below accept.
    ; Note: bmi/bpl test the sign of the compare result (bit 7 of A minus
    ; the constant), not an unsigned ordering.
    .macro quick_exit arg
        .local keep_going
        .local keep_going2
        lda arg + 1
        cmp #(4 << 4)
        bmi keep_going
        rts
    keep_going:
        cmp #(256 - (4 << 4))
        bpl keep_going2
        rts
    keep_going2:
    .endmacro

    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2 - zy_2 + cx
    sub16 zx, zx_2, zy_2
    add16 zx, zx, cx
    quick_exit zx

    ; zy = zx_zy + zx_zy + cy
    add16 zy, zx_zy, zx_zy
    add16 zy, zy, cy
    quick_exit zy

    ; zx_2 = zx * zx
    imul16_round zx_2, zx, zx, 4
    quick_exit zx_2

    ; zy_2 = zy * zy
    imul16_round zy_2, zy, zy, 4
    quick_exit zy_2

    ; zx_zy = zx * zy
    imul16_round zx_zy, zx, zy, 4
    quick_exit zx_zy

    ; dist = zx_2 + zy_2
    add16 dist, zx_2, zy_2
    quick_exit dist

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
    ; (no such check is present here; the loop runs until escape or until
    ; iter wraps)

    jmp loop
.endproc
; Convert a screen coordinate into a scaled fixed-point coordinate:
;   dest = (src << (8 - zoom)) * aspect
;   dest, src - 16-bit variables
;   zoom      - u8 variable holding the zoom level; values above 8 are not
;               handled (the counter would have to wrap around to reach 8)
;   aspect    - 4.12 fixed-point scale factor
; Clobbers A, X, the flags and FR0-FR2.
.macro zoom_factor dest, src, zoom, aspect
    .local cont
    .local enough

    ; cx = (sx << (8 - zoom))
    ; X counts up from the zoom level to 8, shifting once per step.
    copy16 dest, src
    ldx zoom
cont:
    cpx #8
    beq enough
    shl16 dest
    inx
    jmp cont
enough:

    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
    imul16_round dest, dest, aspect, 4
.endmacro
.proc pset
    ; Plot one pixel.
    ; in:  sx, sy - signed screen coords, relative to the screen centre
    ;      iter   - iteration count, mapped to a colour through color_map
    ; Clobbers A, X, Y, the flags, temp and the pixel_* variables.

    ; iter -> color
    ldx iter
    lda color_map,x
    sta pixel_color
    lda #(255 - 3)
    sta pixel_mask

    ; sy -> line base address in pixel_ptr
    lda sy
    bpl positive

negative:
    ; top half: negative rows are offsets back from the end of the top buffer
    lda #.lobyte(framebuffer_top + stride * half_height)
    sta pixel_ptr
    lda #.hibyte(framebuffer_top + stride * half_height)
    sta pixel_ptr + 1
    jmp point

positive:
    ; bottom half: rows 0 and up start at the bottom buffer
    lda #.lobyte(framebuffer_bottom)
    sta pixel_ptr
    lda #.hibyte(framebuffer_bottom)
    sta pixel_ptr + 1

point:
    ; pixel_ptr += sy * stride
    ; temp * 40
    ; = temp * 32 + temp * 8
    ; = (temp << 5) + (temp << 3)
    copy16 temp, sy
    shl16 temp
    shl16 temp
    shl16 temp
    add16 pixel_ptr, pixel_ptr, temp    ; += sy * 8
    shl16 temp
    shl16 temp
    add16 pixel_ptr, pixel_ptr, temp    ; += sy * 32

    ; Ok so pixel_ptr points to the start of the line, which is 40 bytes.
    ; Get the byte and bit offsets
    clc                     ; carry is left over from the add16 above
    lda sx
    adc #half_width
    sta temp                ; temp = column, 0 .. width - 1

    ; pixel_shift = temp & 3
    ; pixel_color <<= 2 * (3 - pixel_shift) (shifting in zeros)
    ; pixel_mask  <<= 2 * (3 - pixel_shift) (shifting in ones)
    and #3
    sta pixel_shift
    lda #3
    sec
    sbc pixel_shift
    tax                     ; X = number of 2-bit steps; Z set when none
    beq shift_done
shift_loop:
    asl pixel_color
    asl pixel_color
    sec                     ; rol shifts carry in, and the mask needs ones
    rol pixel_mask
    sec
    rol pixel_mask
    dex
    bne shift_loop
shift_done:

    ; pixel_offset = temp >> 2
    lda temp
    lsr a
    lsr a
    sta pixel_offset
    tay                     ; Y indexes the byte within the line

    ; read, mask, or, write
    lda (pixel_ptr),y
    and pixel_mask
    ora pixel_color
    sta (pixel_ptr),y

    rts
.endproc
; Write a string into the status line.
;   col  - starting column within textbuffer
;   len  - number of characters (assembly-time constant)
;   cstr - address of the source string
; Each character is translated through char_map before being stored.
.macro draw_text col, len, cstr
    ; clobbers A, X, Y
    .local loop
    .local done
    ldx #0
loop:
    cpx #len
    beq done
    ldy cstr,x
    lda char_map,y
    sta textbuffer + col,x
    inx
    jmp loop
done:
.endmacro
; Program entry point: set up the display, then render every pixel by
; calling mandelbrot and pset, and finally spin forever.
.proc start

    ; ox = 0; oy = 0; zoom = 0
    lda #0
    sta ox
    sta ox + 1
    sta oy
    sta oy + 1

    ; zoom = 2x
    lda #1
    sta zoom

    ; Disable display DMA
    lda #0
    sta DMACTL

    ; zero the range from framebuffer_top to framebuffer_end
    ; temp walks through the range one 256-byte page at a time.
    lda #.lobyte(framebuffer_top)
    sta temp
    lda #.hibyte(framebuffer_top)
    sta temp + 1
zero_page_loop:
    lda #0              ; A is reused for the page compare below, so reload
    ldy #0
zero_byte_loop:
    sta (temp),y
    iny
    bne zero_byte_loop  ; Y wraps to 0 after 256 bytes
    inc temp + 1
    lda temp + 1
    cmp #.hibyte(framebuffer_end)
    bne zero_page_loop

    ; Copy the display list into properly aligned memory
    ; Can't cross 1024-byte boundaries :D
    ldx #0
copy_byte_loop:
    lda display_list_start,x
    sta display_list,x
    inx
    cpx #display_list_len
    bne copy_byte_loop

    ; Set up the display list
    lda #.lobyte(display_list)
    sta DLISTL ; actual register
    sta SDLSTL ; shadow register the OS will copy in
    lda #.hibyte(display_list)
    sta DLISTH ; actual register
    sta SDLSTH ; shadow register the OS will copy in

    ; Status bar
    draw_text 0, str_self_len, str_self
    draw_text 40 - str_run_len, str_run_len, str_run

    ; Re-enable display DMA
    lda #$22
    sta DMACTL

    ; sy = -92 .. 91
    lda #(256-half_height)
    sta sy
    lda #(256-1)
    sta sy + 1

loop_sy:
    ; sx = -80 .. 79
    lda #(256-half_width)
    sta sx
    lda #(256-1)
    sta sx + 1

loop_sx:
    zoom_factor cx, sx, zoom, aspect_x
    zoom_factor cy, sy, zoom, aspect_y
    jsr mandelbrot
    jsr pset

    ; sx++ (16-bit); carry must be clear for the increment to be exactly 1
    clc
    lda sx
    adc #1
    sta sx
    lda sx + 1
    adc #0
    sta sx + 1

    ; only the low byte is compared; enough for the -80 .. 79 range
    lda sx
    cmp #half_width
    beq loop_sx_done
    jmp loop_sx

loop_sx_done:
    ; sy++ (16-bit)
    clc
    lda sy
    adc #1
    sta sy
    lda sy + 1
    adc #0
    sta sy + 1

    ; only the low byte is compared; enough for the -92 .. 91 range
    lda sy
    cmp #half_height
    beq loop_sy_done
    jmp loop_sy

loop_sy_done:
    draw_text 40 - str_done_len, str_done_len, str_done

loop:
    ; finished
    jmp loop
.endproc