; mandel-6502/mandel.s

; ---------------------------------------------------------------------------
; Zero-page working storage
; ---------------------------------------------------------------------------
sx           = $80      ; i16: screen pixel x
sy           = $82      ; i16: screen pixel y
ox           = $84      ; fixed3.13: center point x
oy           = $86      ; fixed3.13: center point y
cx           = $84      ; fixed3.13: c_x (same address as ox)
cy           = $86      ; fixed3.13: c_y (same address as oy)
zx           = $88      ; fixed3.13: z_x
zy           = $8a      ; fixed3.13: z_y
zx_2         = $90      ; fixed6.26: z_x^2
zy_2         = $94      ; fixed6.26: z_y^2
zx_zy        = $98      ; fixed6.26: z_x * z_y
dist         = $9c      ; fixed6.26: z_x^2 + z_y^2
iter         = $a0      ; u8: iteration count
zoom         = $a1      ; u8: zoom shift level
temp         = $a2      ; u16
temp2        = $a4      ; u16

pixel_ptr    = $b0      ; u16
pixel_color  = $b2      ; u8
pixel_mask   = $b3      ; u8
pixel_shift  = $b4      ; u8
pixel_offset = $b5      ; u8

; FP registers in zero page (used here as multiply argument/result slots)
FR0          = $d4
FRE          = $da
FR1          = $e0
FR2          = $e6

; ---------------------------------------------------------------------------
; High memory layout
; ---------------------------------------------------------------------------
framebuffer_top    = $8000
textbuffer         = $8f00
framebuffer_bottom = $9000
display_list       = $9f00
framebuffer_end    = $a000

; ---------------------------------------------------------------------------
; Screen geometry
; ---------------------------------------------------------------------------
height       = 184
half_height  = height / 2
width        = 160
half_width   = width / 2
stride       = width / 4        ; bytes per line: 4 pixels per byte

width_ratio_3_13  = 5 << (13 - 2)   ; 5/4 in 3.13 fixed point
height_ratio_3_13 = 3 << (13 - 2)   ; 3/4 in 3.13 fixed point

; ---------------------------------------------------------------------------
; Hardware registers
; ---------------------------------------------------------------------------
DMACTL       = $D400
DLISTL       = $D402
DLISTH       = $D403

; OS shadow registers
SDLSTL       = $230
SDLSTH       = $231
2023-01-22 03:17:30 +00:00
.data

; Status-bar text. Every string is bracketed by a start and an end label so
; that its length can be computed at assembly time.
strings:

str_self:
    .byte "MANDEL-6502"
str_self_end:
str_self_len  = str_self_end - str_self

str_speed:
    .byte "ms/px"
str_speed_end:
str_speed_len = str_speed_end - str_speed

str_run:
    .byte " RUN"
str_run_end:
str_run_len   = str_run_end - str_run

str_done:
    .byte "DONE"
str_done_end:
str_done_len  = str_done_end - str_done
char_map:
    ; 128-entry lookup table, indexed by ATASCII character value, giving the
    ; font entry to store in the text buffer:
    ;   index   0..31  -> 64..95
    ;   index  32..95  ->  0..63
    ;   index  96..127 -> 96..127
    .repeat 32, n
        .byte 64 + n
    .endrepeat
    .repeat 64, n
        .byte n
    .endrepeat
    .repeat 32, n
        .byte n + 96
    .endrepeat
2023-01-22 14:12:40 +00:00
2023-01-22 03:17:30 +00:00
aspect:
    ; Aspect-ratio correction factors, stored as 3.13 fixed point.
    ;
    ; Pixels at 320 wide are 5:6 (narrow);
    ; pixels at 160 wide are 5:3 (wide).
    ;
    ;   cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
    ;   cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
    ;
    ; so the vertical range   -92 .. 91.9 maps to -2.15625 .. 2.15624
    ; and the horizontal range -80 .. 79.9 maps to -3.125 .. 3.124
    ;
    ; 184 lines is the equivalent of 220.8 lines at square pixels:
    ; 320 / 220.8 = 1.45 display aspect ratio.
aspect_x:
    .word (5 << 13) >> 2        ; 5/4
aspect_y:
    .word (3 << 13) >> 2        ; 3/4
bit_masks:
    ; One 2-bit mask for each of the four pixel positions within a byte,
    ; lowest bit pair first.
    .byte %00000011
    .byte %00001100
    .byte %00110000
    .byte %11000000
display_list_start:
    ; Template for the display list; `start` copies display_list_len bytes
    ; of it to `display_list` before pointing the hardware at it.

    ; 24 lines of overscan: three entries of 8 blank lines each
    .repeat 3
        .byte $70
    .endrepeat

    ; 8 scan lines: one row of 40-column text, fetched from textbuffer
    .byte $42
    .addr textbuffer

    ; 184 lines of graphics in ANTIC mode E
    ; (160 pixels, 2 bits per pixel, 1 scan line per line),
    ; split into two halves with separate start addresses.

    ; upper half: first line carries the address, the rest follow on
    .byte $4e
    .addr framebuffer_top
    .repeat half_height - 1
        .byte $0e
    .endrepeat

    ; lower half
    .byte $4e
    .addr framebuffer_bottom
    .repeat half_height - 1
        .byte $0e
    .endrepeat

    ; jump back to the start of the display list and blank
    .byte $41
    .addr display_list
display_list_end:
display_list_len = display_list_end - display_list_start
2023-01-22 03:17:30 +00:00
2023-01-22 16:20:59 +00:00
color_map:
    ; 256-entry table mapping an iteration count to a 2-bit pixel color.
    ; Entry 0 is color 0; entries 1..255 cycle through colors 1, 2, 3.
    .byte 0
    .repeat 85
        .byte 1, 2, 3
    .endrepeat
2022-12-29 05:08:16 +00:00
; Everything below is assembled into the code segment.
; `start` is exported -- presumably the program entry point named by the
; linker configuration (TODO confirm against the build files).
.code
.export start
; add: dest = arg1 + arg2 on little-endian integers `bytes` bytes wide.
; The carry is cleared first and then chained from byte to byte.
; Clobbers A and flags.
; Cost: 2 + 9 * bytes cycles.
.macro add bytes, dest, arg1, arg2
    clc                         ; 2 cyc
    .repeat bytes, offs         ; 9 cyc per byte
        lda arg1 + offs
        adc arg2 + offs
        sta dest + offs
    .endrepeat
.endmacro

; add16: 16-bit form of add.
.macro add16 dest, arg1, arg2
    add 2, dest, arg1, arg2
.endmacro
; add32: dest = arg1 + arg2 on 32-bit little-endian integers.
; Clobbers A and flags.
;
; Expands to the generic `add` with a width of 4 bytes and both source
; operands passed through, mirroring sub32. (The previous expansion added
; only the low two bytes and used `dest` as a source in place of arg1.)
.macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
.endmacro
; sub: dest = arg1 - arg2 on little-endian integers `bytes` bytes wide.
; The carry is set first (no borrow) and then chained from byte to byte.
; Clobbers A and flags.
; Cost: 2 + 9 * bytes cycles.
.macro sub bytes, dest, arg1, arg2
    sec                         ; 2 cyc
    .repeat bytes, offs         ; 9 cyc per byte
        lda arg1 + offs
        sbc arg2 + offs
        sta dest + offs
    .endrepeat
.endmacro

; sub16: 16-bit form of sub.
.macro sub16 dest, arg1, arg2
    sub 2, dest, arg1, arg2
.endmacro

; sub32: 32-bit form of sub.
.macro sub32 dest, arg1, arg2
    sub 4, dest, arg1, arg2
.endmacro
; shl: shift the little-endian integer at `arg` (`bytes` bytes wide) left
; by one bit, in place. The lowest byte is shifted with asl and every
; following byte picks up the carry with rol.
; Modifies flags; the bit shifted out of the top byte is left in carry.
.macro shl bytes, arg
    asl arg
    .repeat bytes - 1, n
        rol arg + 1 + n
    .endrepeat
.endmacro

; shl16: shift the 2 bytes starting at arg.
.macro shl16 arg
    shl 2, arg
.endmacro

; shl24: shift the 3 bytes starting at arg.
.macro shl24 arg
    shl 3, arg
.endmacro

; shl32: shift the 4 bytes starting at arg.
.macro shl32 arg
    shl 4, arg
.endmacro
; copy: copy `bytes` bytes from arg to dest, lowest address first.
; Clobbers A and flags.
; Cost: 6 * bytes cycles.
.macro copy bytes, dest, arg
    .repeat bytes, offs
        lda arg + offs          ; 3 cyc
        sta dest + offs         ; 3 cyc
    .endrepeat
.endmacro

; copy16: 2-byte copy.
.macro copy16 dest, arg
    copy 2, dest, arg
.endmacro

; copy32: 4-byte copy.
.macro copy32 dest, arg
    copy 4, dest, arg
.endmacro
2022-12-30 08:43:44 +00:00
; neg: two's-complement negate the little-endian integer at `arg`
; (`bytes` bytes wide) in place, computed as 0 - arg with borrow chained
; from byte to byte.
; Clobbers A and flags.
; Cost: 2 + 8 * bytes cycles.
.macro neg bytes, arg
    sec                         ; 2 cyc
    .repeat bytes, offs         ; 8 cyc per byte
        lda #0                  ; 2 cyc
        sbc arg + offs          ; 3 cyc
        sta arg + offs          ; 3 cyc
    .endrepeat
.endmacro

; neg16: 16-bit negate, 18 cycles.
.macro neg16 arg
    neg 2, arg
.endmacro

; neg32: 32-bit negate, 34 cycles.
.macro neg32 arg
    neg 4, arg
.endmacro
2023-01-22 16:20:59 +00:00
; extend_8_16: sign-extend the byte at src into the 16-bit value at dest.
; The low byte is copied as-is; the high byte becomes $00 when the source
; is non-negative and $ff when it is negative.
; Clobbers A, X and flags.
; Cost: 13-15 cycles.
.macro extend_8_16 dest, src
    .local store_high
    ldx #0                      ; 2 cyc - assume a positive high byte
    lda src                     ; 3 cyc - sets N from the source's sign bit
    sta dest                    ; 3 cyc
    bpl store_high              ; 2 cyc
    dex                         ; 2 cyc - negative: X = $ff
store_high:
    stx dest + 1                ; 3 cyc
.endmacro
2022-12-31 02:25:43 +00:00
; bitmul16: one step of the unrolled shift-and-add multiply used by
; imul16_func.
;
; Tests bit `bitnum` of the 16-bit value at arg1. When the bit is set, the
; 16-bit value at arg2 is added into the top two bytes of the 32-bit
; `result`. In both cases the result is then shifted right by one bit.
;
; arg1 and arg2 are treated as unsigned; negative signed inputs must be
; flipped to positive before the first step.
;
; Clobbers A and flags.
; Cost:
;   bitnum <  8: 25 or 41 cycles
;   bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
    .local bit_clear
    .local bit_set
    .local shift_rest

    ; Isolate the multiplier bit for this step (5 cycles either way)
    .if bitnum < 8
        lda arg1                    ; 3 cyc
        and #(1 << bitnum)          ; 2 cyc
    .else
        lda arg1 + 1                ; 3 cyc
        and #(1 << (bitnum - 8))    ; 2 cyc
    .endif
    bne bit_set                     ; 2 cyc

bit_clear:
    ; Nothing to add: start the right shift with a zero coming in on top
    lsr result + 3                  ; 5 cyc
    jmp shift_rest                  ; 3 cyc

bit_set:
    ; 16-bit add into the top half of the result
    clc                             ; 2 cyc
    lda result + 2                  ; 3 cyc
    adc arg2                        ; 3 cyc
    sta result + 2                  ; 3 cyc
    lda result + 3                  ; 3 cyc
    adc arg2 + 1                    ; 3 cyc
    ror a                           ; 2 cyc - carry out of the add becomes
                                    ;         the new top bit
    sta result + 3                  ; 3 cyc

shift_rest:
    ror result + 2                  ; 5 cyc
    ror result + 1                  ; 5 cyc
    .if bitnum >= 8
        ; During the first 8 steps this byte would only receive
        ; uninitialized data, so the shift is skipped there:
        ; 5 cycles * 8 steps = 40 cycles saved in total.
        ror result                  ; 5 cyc
    .endif
.endmacro
2022-12-31 01:33:18 +00:00
; check_sign: make the signed 16-bit value at `arg` non-negative, counting
; in X how many values had to be flipped.
;
; If the high byte has its sign bit set, the value is negated in place and
; X is incremented; otherwise nothing is changed.
; Clobbers A and flags; X is read and possibly incremented.
; Cost: 5 to 25 cycles.
.macro check_sign arg
    .local non_negative
    lda arg + 1                 ; 3 cyc
    bpl non_negative            ; 2 cyc
    neg16 arg                   ; 18 cyc
    inx                         ; 2 cyc
non_negative:
.endmacro
2022-12-29 05:08:16 +00:00
; imul16: dest (32-bit) = arg1 * arg2, both signed 16-bit.
;
; The operands are staged in FR0 and FR1, imul16_func is called, and the
; 32-bit product is copied out of FR2.
; Clobbers A, X, flags, FR0, FR1 and FR2.
; Cost: 518 - 828 cycles.
.macro imul16 dest, arg1, arg2
    copy 2, FR0, arg1           ; 12 cyc
    copy 2, FR1, arg2           ; 12 cyc
    jsr imul16_func             ; 470-780 cyc
    copy 4, dest, FR2           ; 24 cyc
.endmacro
2023-01-22 03:17:30 +00:00
; imul16_round: dest (16-bit) = top 16 bits of arg1 * arg2 after rounding.
;
; The operands are staged in FR0 and FR1, imul16_func leaves the 32-bit
; product in FR2, round16 adjusts its upper word in place, and that upper
; word (FR2 + 2) is copied to dest.
; Clobbers A, X, flags, FR0, FR1 and FR2.
.macro imul16_round dest, arg1, arg2
    copy 2, FR0, arg1           ; 12 cyc
    copy 2, FR1, arg2           ; 12 cyc
    jsr imul16_func             ; 470-780 cyc
    round16 FR2                 ; 5-28 cyc
    copy 2, dest, FR2 + 2       ; 12 cyc
.endmacro
2023-01-05 04:37:16 +00:00
; imul16_func: signed 16 x 16 -> 32-bit multiply.
;
; In:    FR0 = first 16-bit operand  (clobbered)
;        FR1 = second 16-bit operand (clobbered)
; Out:   FR2 = 32-bit product
; Clobb: A, X, flags
; Cost:  470 cycles minimum, 780 cycles maximum
.proc imul16_func
    multiplier   = FR0          ; 16-bit arg (clobbered)
    multiplicand = FR1          ; 16-bit arg (clobbered)
    product      = FR2          ; 32-bit result

    ; Flip negative inputs to positive; X counts how many were flipped.
    ldx #0                      ; 2 cyc
    check_sign multiplier       ; 5 to 25 cyc
    check_sign multiplicand     ; 5 to 25 cyc

    ; Clear the top 16 bits of the product. The bottom two bytes need no
    ; clearing because the shifts below replace their contents.
    lda #0                      ; 2 cyc
    sta product + 2             ; 3 cyc
    sta product + 3             ; 3 cyc

    ; Fully unrolled for speed, at the cost of a larger routine.
    ; 440 to 696 cycles in total:
    ;   steps 0..7:  25 or 41 cycles each
    ;   steps 8..15: 30 or 46 cycles each
    .repeat 16, n
        bitmul16 multiplier, multiplicand, product, n
    .endrepeat

    ; Exactly one negative input means the product is negative.
    cpx #1                      ; 2 cyc
    bne done                    ; 2 cyc
    neg32 product               ; 34 cyc
done:
    rts                         ; 6 cyc
.endproc
; round16: round the top 16 bits of the 32-bit little-endian fixed-point
; number at `arg` to nearest, in place.
;
; Bytes arg+2 / arg+3 hold the 16 bits being kept; bit 7 of arg+1 is the
; most significant discarded bit (one half unit of the kept word).
;
; In two's complement the kept word is always the floor of the full value,
; for negative numbers as well as positive ones. Rounding to nearest is
; therefore "add one when the half bit is set", independent of sign.
;
; Two defects of the earlier version are corrected here:
;   * the carry from the low kept byte into the high kept byte was tested
;     with the wrong polarity, so the high byte was bumped whenever the low
;     byte did NOT wrap;
;   * negative values were decremented, which moved them away from the
;     nearest value instead of towards it.
;
; Clobbers A and flags. No rounding costs one load and one taken branch;
; rounding adds one or two zero-page increments.
.macro round16 arg
    .local done
    lda arg + 1                 ; N = most significant discarded bit
    bpl done                    ; below one half: keep the floor
    inc arg + 2                 ; round up
    bne done                    ; low byte did not wrap: no carry
    inc arg + 3                 ; carry into the high byte
done:
.endmacro
; mandelbrot: iterate z = z^2 + c for one point.
;
; In:    cx, cy = c, 4.12 fixed point (-8 .. +7.9)
; Out:   iter   = iteration count at escape, or 0 if the point had not
;                 escaped when the 8-bit counter wrapped
; Clobb: A, X, flags, FR0, FR1, FR2, zx, zy, zx_2, zy_2, zx_zy, dist
;
; zx_2, zy_2 and zx_zy are 32-bit slots. Straight after a multiply they
; hold the full 8.24 product; after the shift-and-round at the end of each
; pass their upper word (bytes 2..3) holds the value rounded back to 4.12.
; The upper word is therefore what feeds the next pass.
;
; Corrections relative to the earlier version:
;   * zy is built from zx_zy + zx_zy (it used a subtraction, which always
;     gave zero);
;   * the 4.12 values are read from bytes 2..3 of the 32-bit slots, where
;     round16 leaves them, instead of bytes 0..1;
;   * the 4-bit left shift is applied to bytes 1..3 so bits carry into the
;     top byte; byte 0 cannot influence either the kept word or the
;     rounding bit, so it is left alone;
;   * the escape test uses an unsigned comparison, since dist is a sum of
;     squares and never negative.
.proc mandelbrot
    ; Clear zx, zy, zx_2, zy_2, zx_zy, dist and iter: every byte from zx
    ; up to and including iter.
    lda #0
    ldx #(iter - zx + 1)
clear_state:
    sta zx - 1,x
    dex
    bne clear_state

next_pass:
    ; Count the pass; give up when the 8-bit counter wraps to 0.
    inc iter                    ; 5 cyc
    bne iterate                 ; 2 cyc
    rts

iterate:
    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2 - zy_2 + cx
    sub16 zx, zx_2 + 2, zy_2 + 2
    add16 zx, zx, cx

    ; zy = zx_zy + zx_zy + cy
    add16 zy, zx_zy + 2, zx_zy + 2
    add16 zy, zy, cy

    ; 8.24: (-128 .. +127.9)
    ; zx_2 = zx * zx
    imul16 zx_2, zx, zx

    ; zy_2 = zy * zy
    imul16 zy_2, zy, zy

    ; zx_zy = zx * zy
    imul16 zx_zy, zx, zy

    ; dist = zx_2 + zy_2
    add32 dist, zx_2, zy_2

    ; The top byte of an 8.24 value is its integer part.
    ; if dist >= 4 break, else continue iterating
    lda dist + 3                ; 3 cyc
    cmp #4                      ; 2 cyc
    bcc still_in                ; unsigned: integer part < 4
    rts

still_in:
    ; Convert each 8.24 product back to 4.12: shift left 4 bits so the
    ; wanted 16 bits land in bytes 2..3, then round on the bit below them.
    .repeat 4
        shl24 zx_2 + 1
    .endrepeat
    round16 zx_2

    .repeat 4
        shl24 zy_2 + 1
    .endrepeat
    round16 zy_2

    .repeat 4
        shl24 zx_zy + 1
    .endrepeat
    round16 zx_zy

    ; Possible future optimization: when the point may be in the lake, look
    ; for looping output with a small buffer instead of running to the
    ; iteration cap.

    jmp next_pass               ; 3 cyc
.endproc
2022-12-30 04:32:58 +00:00
2023-01-22 03:17:30 +00:00
; zoom_factor: convert a screen coordinate into a scaled coordinate.
;
;   dest = round_top16((src << (8 - zoom)) * aspect)
;
; dest   - 16-bit destination
; src    - 16-bit signed screen coordinate
; zoom   - address of the u8 zoom shift level
; aspect - address of the 16-bit aspect factor (aspect_x or aspect_y)
;
; Clobbers A, X, flags, FR0, FR1 and FR2.
;
; NOTE(review): the shift loop stops only when X equals 8, so it appears to
; assume zoom <= 8 -- confirm against the callers.
.macro zoom_factor dest, src, zoom, aspect
    .local shift_more
    .local shifted

    ; dest = src << (8 - zoom), one bit per pass while X counts up to 8
    copy16 dest, src
    ldx zoom
shift_more:
    cpx #8
    beq shifted
    shl16 dest
    inx
    jmp shift_more

shifted:
    ; Apply the aspect factor: 3/4 for y, 5/4 for x
    imul16_round dest, dest, aspect
.endmacro
; pset: plot one pixel.
;
; In:    sx, sy = signed screen coordinates, relative to the screen center
;        iter   = index into color_map selecting the pixel color
; Clobb: A, X, Y, flags, temp, pixel_ptr, pixel_color, pixel_mask,
;        pixel_shift, pixel_offset
.proc pset
    ; Look up the 2-bit color and start with a mask that clears the lowest
    ; pixel position of a byte.
    ldx iter
    lda color_map,x
    sta pixel_color
    lda #%11111100
    sta pixel_mask

    ; Choose the base address that row sy = 0 of the relevant half maps to.
    lda sy
    bpl lower_half

upper_half:
    ; Negative sy: base is the line just past the upper half, so adding the
    ; (negative) row offset lands inside the upper framebuffer.
    lda #.lobyte(framebuffer_top + stride * half_height)
    sta pixel_ptr
    lda #.hibyte(framebuffer_top + stride * half_height)
    sta pixel_ptr + 1
    jmp add_row

lower_half:
    lda #.lobyte(framebuffer_bottom)
    sta pixel_ptr
    lda #.hibyte(framebuffer_bottom)
    sta pixel_ptr + 1

add_row:
    ; pixel_ptr += sy * 40, built as (sy << 3) + (sy << 5)
    copy16 temp, sy
    .repeat 3
        shl16 temp              ; temp = sy * 8
    .endrepeat
    add16 pixel_ptr, pixel_ptr, temp
    .repeat 2
        shl16 temp              ; temp = sy * 32
    .endrepeat
    add16 pixel_ptr, pixel_ptr, temp

    ; pixel_ptr now addresses the start of the 40-byte line.
    ; temp = column counted from the left edge
    lda sx
    clc
    adc #half_width
    sta temp

    ; pixel_shift = column & 3. The color and mask are moved up by
    ; (3 - pixel_shift) pixel positions, two bits per position: zeros are
    ; shifted into the color and ones into the mask.
    and #3
    sta pixel_shift
    lda #3
    sec
    sbc pixel_shift
    tax                         ; also sets Z for the first loop test
align_loop:
    beq aligned
    asl pixel_color
    asl pixel_color
    sec
    rol pixel_mask
    sec
    rol pixel_mask
    dex                         ; sets Z for the next loop test
    jmp align_loop

aligned:
    ; pixel_offset = column >> 2: the byte within the line
    lda temp
    lsr a
    lsr a
    sta pixel_offset
    tay

    ; Read the byte, clear the target pixel, merge the color, write back
    lda (pixel_ptr),y
    and pixel_mask
    ora pixel_color
    sta (pixel_ptr),y

    rts
.endproc
; draw_text: write a string into the status line.
;
; col  - starting column within textbuffer
; len  - number of characters (assembly-time constant)
; cstr - address of the ATASCII string
;
; Each character is translated through char_map before it is stored.
; Clobbers A, X, Y and flags.
.macro draw_text col, len, cstr
    .local next_char
    .local finished
    ldx #0
next_char:
    cpx #len
    beq finished
    ldy cstr,x                  ; Y = ATASCII character
    lda char_map,y              ; A = font entry for that character
    sta textbuffer + col,x
    inx
    jmp next_char
finished:
.endmacro
2022-12-30 04:32:58 +00:00
; start: program entry. Sets up the display, renders every pixel of the
; image once, then idles forever.
.proc start

    ; ox = 0; oy = 0; zoom = 0
    lda #0
    sta ox
    sta ox + 1
    sta oy
    sta oy + 1
    sta zoom

    ; Disable display DMA (A is still 0)
    sta DMACTL

    ; Zero every 256-byte page from framebuffer_top up to framebuffer_end
    lda #.lobyte(framebuffer_top)
    sta temp
    lda #.hibyte(framebuffer_top)
    sta temp + 1
clear_page:
    lda #0
    ldy #0
clear_byte:
    sta (temp),y
    iny
    bne clear_byte
    inc temp + 1
    lda temp + 1
    cmp #.hibyte(framebuffer_end)
    bne clear_page

    ; Copy the display list into properly aligned memory
    ; (it can't cross 1024-byte boundaries)
    ldx #0
copy_dlist:
    lda display_list_start,x
    sta display_list,x
    inx
    cpx #display_list_len
    bne copy_dlist

    ; Point both the hardware and the OS shadow at the display list
    lda #.lobyte(display_list)
    sta DLISTL                  ; actual register
    sta SDLSTL                  ; shadow register the OS will copy in
    lda #.hibyte(display_list)
    sta DLISTH                  ; actual register
    sta SDLSTH                  ; shadow register the OS will copy in

    ; Status bar: program name on the left, run state right-aligned
    draw_text 0, str_self_len, str_self
    draw_text 40 - str_run_len, str_run_len, str_run

    ; Re-enable display DMA
    lda #$22
    sta DMACTL

render:
    ; sy = -92 .. 91
    lda #.lobyte(-half_height)
    sta sy
    lda #.hibyte(-half_height)
    sta sy + 1

next_row:
    ; sx = -80 .. 79
    lda #.lobyte(-half_width)
    sta sx
    lda #.hibyte(-half_width)
    sta sx + 1

next_pixel:
    zoom_factor cx, sx, zoom, aspect_x
    zoom_factor cy, sy, zoom, aspect_y
    jsr mandelbrot
    ; NOTE(review): mandelbrot describes iter == 0 as "did not escape";
    ; this increment shifts every result by one before the color lookup in
    ; pset -- confirm that this is intended.
    inc iter
    jsr pset

    ; sx += 1 (16-bit)
    clc
    lda sx
    adc #1
    sta sx
    lda sx + 1
    adc #0
    sta sx + 1

    ; The row is finished when the low byte of sx reaches half_width.
    ; The loop body is too long for a relative branch back, hence the
    ; branch-over-jump.
    lda sx
    cmp #half_width
    beq row_done
    jmp next_pixel

row_done:
    ; sy += 1 (16-bit)
    clc
    lda sy
    adc #1
    sta sy
    lda sy + 1
    adc #0
    sta sy + 1

    ; The image is finished when the low byte of sy reaches half_height.
    lda sy
    cmp #half_height
    beq image_done
    jmp next_row

image_done:
    draw_text 40 - str_done_len, str_done_len, str_done

forever:
    ; finished
    jmp forever
.endproc