mandel-6502/mandel.s

; Our zero-page vars
; Multi-byte values are stored low byte first: the arithmetic macros in this
; file treat "name + 0" as the least significant byte.
sx    = $80     ; i16: screen pixel x
sy    = $82     ; i16: screen pixel y
ox    = $84     ; fixed4.12: center point x
oy    = $86     ; fixed4.12: center point y
cx    = $88     ; fixed4.12: c_x
cy    = $8a     ; fixed4.12: c_y
zx    = $8c     ; fixed4.12: z_x
zy    = $8e     ; fixed4.12: z_y

zx_2  = $90     ; fixed4.12: z_x^2
zy_2  = $92     ; fixed4.12: z_y^2
zx_zy = $94     ; fixed4.12: z_x * z_y
dist  = $96     ; fixed4.12: z_x^2 + z_y^2

; NOTE: mandelbrot zeroes every byte from zx through iter with one loop, so
; zx .. iter has to stay a single contiguous range.
iter         = $a0 ; u8: iteration count

zoom         = $a1 ; u8: zoom shift level
count_frames = $a2 ; u8: incremented by vblank_handler, cleared on status update
count_pixels = $a3 ; u8: pixels drawn since the last status update
total_ms     = $a4 ; float48
total_pixels = $aa ; float48

temp         = $b0 ; u16
temp2        = $b2 ; u16 (scratch product inside imul16_func)
pixel_ptr    = $b4 ; u16
pixel_color  = $b6 ; u8
pixel_mask   = $b7 ; u8
pixel_shift  = $b8 ; u8
pixel_offset = $b9 ; u8


; FP registers in zero page
FR0    = $d4 ; float48
FRE    = $da ; (not referenced in this file)
FR1    = $e0 ; float48
FR2    = $e6 ; float48
; NOTE: the integer multiply code in this file also borrows FR0/FR1/FR2 as
; plain scratch space (see imul16_func).
CIX    = $f2 ; u8 - index into INBUFF
INBUFF = $f3 ; u16 - pointer to ascii
FLPTR  = $fc ; u16 - pointer to user buffer float48

LBUFF  = $0580 ; result buffer for FASC routine

; FP ROM routine vectors
; (XX) = address passed in X only, (YYXX) = high byte in Y, low byte in X
FASC   = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
IFP    = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
FADD   = $DA66 ; ADDITION       (FR0 += FR1)
FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
FDIV   = $DB28 ; DIVISION       (FR0 /= FR1)
ZF1    = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
FLD0R  = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
FLD1R  = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
FMOVE  = $DDB6 ; MOVE FR0 TO FR1

; High data
; start clears everything from framebuffer_top up to framebuffer_end before
; copying the display list into place.
framebuffer_top    = $8000 ; rows for sy < 0 (first half of the picture)
textbuffer         = $8f00 ; one row of status text
framebuffer_bottom = $9000 ; rows for sy >= 0 (second half of the picture)
display_list       = $9f00 ; run-time copy of display_list_start
framebuffer_end    = $a000

height = 184
half_height = height >> 1
width = 160
half_width = width >> 1
stride = width >> 2 ; bytes per row: 4 pixels per byte at 2 bits per pixel

DMACTL = $D400
DLISTL = $D402
DLISTH = $D403

; OS shadow registers
SDLSTL = $230
SDLSTH = $231

; interrupt stuff
XITVBV = $E462 ; vblank_handler exits by jumping here
SETVBV = $E45C ; called by start to install vblank_handler

; Layout of the 6-byte (48-bit) floating point values used with the FP ROM
; routines: one exponent/sign byte followed by five bytes of BCD digits.
; (ms_per_frame is a literal in this layout; FR0, FR1 and FR2 are spaced six
; bytes apart.)  The mantissa was previously declared as 6 bytes, which made
; the struct 7 bytes long and disagreed with every float in this file.
.struct float48
    exponent .byte
    mantissa .byte 5
.endstruct

.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512

.data

; Status bar text.  Each string is bracketed by a start and an _end label so
; its length can be computed at assembly time; there are no terminators.
; draw_text translates the bytes through char_map before storing them.
strings:
str_self:
    .byte "MANDEL-6502"
str_self_end:
str_speed:
    .byte "ms/px"
str_speed_end:
str_run:
    .byte " RUN"
str_run_end:
str_done:
    .byte "DONE"
str_done_end:

str_self_len = str_self_end - str_self
str_speed_len = str_speed_end - str_speed
str_run_len = str_run_end - str_run
str_done_len = str_done_end - str_done

; text column where start writes the speed readout
speed_start = str_self_len + 2
; (speed_len and str_speed are not referenced elsewhere in this file)
speed_len = 14 + str_speed_len


char_map:
    ; Map ATASCII string values to framebuffer font entries.
    ; 128-entry lookup table indexed by character code:
    ;   codes   0 ..  31  ->  64 ..  95
    ;   codes  32 ..  95  ->   0 ..  63
    ;   codes  96 .. 127  ->  96 .. 127 (unchanged)
    .repeat 32, i
        .byte i + 64
    .endrepeat
    .repeat 64, i
        .byte i
    .endrepeat
    .repeat 32, i
        .byte 96 + i
    .endrepeat

aspect:
    ; aspect ratio!
    ; pixels at 320w are 5:6 (narrow)
    ; pixels at 160w are 5:3 (wide)
    ;
    ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
    ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
    ;
    ; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
    ; and horizontal range -80 .. 79.9 is -3.125 .. 3.124
    ; (these figures correspond to zoom = 1, the value start sets)
    ;
    ; 184h is the equiv of 220.8h at square pixels
    ; 320 / 220.8 = 1.45 display aspect ratio
    ;
    ; Both factors are 16-bit fixed4.12 values (1.0 = 1 << 12); they are
    ; passed to imul16_round by the zoom_factor macro.
aspect_x: ; fixed4.12 5/4
    .word 5 << (12 - 2)

aspect_y: ; fixed4.12 3/4
    .word 3 << (12 - 2)

; Milliseconds per counted frame.  start loads this into FR1 with FLD1R and
; multiplies it by the frame count.  Six bytes: one exponent/sign byte plus
; five bytes of packed BCD digits.
ms_per_frame: ; float48 16.66666667
    .byte 64  ; exponent/sign
    .byte $16 ; BCD digits
    .byte $66
    .byte $66
    .byte $66
    .byte $67

; Display list template.  start copies display_list_len bytes from here to
; display_list and points DLISTL/DLISTH (and the OS shadows) at that copy.
; The final jump therefore targets display_list, not this template.
display_list_start:
    ; 24 lines overscan
    .repeat 3
        .byte $70 ; 8 blank lines
    .endrep

    ; 8 scan lines, 1 row of 40-column text
    .byte $42
    .addr textbuffer

    ; 184 lines graphics
    ; ANTIC mode e (160px 2bpp, 1 scan line per line)
    ; Each half of the picture gets its own address load: the first
    ; half_height rows come from framebuffer_top ...
    .byte $4e
    .addr framebuffer_top
    .repeat half_height - 1
        .byte $0e
    .endrep
    ; ... and the remaining half_height rows from framebuffer_bottom.
    .byte $4e
    .addr framebuffer_bottom
    .repeat half_height - 1
        .byte $0e
    .endrep

    .byte $41 ; jump and blank
    .addr display_list
display_list_end:
display_list_len = display_list_end - display_list_start

; 256-entry table indexed by iteration count (see pset).
; Entry 0 is color 0; entries 1 .. 255 cycle through colors 1, 2, 3.
color_map:
    .byte 0
    .repeat 255, n
        .byte (n .mod 3) + 1
    .endrepeat

.code

.export start

; Multi-byte add: dest = arg1 + arg2 over `bytes` bytes, low byte first.
; Clobbers A and the flags; carry out of the top byte is left in C.
; 2 + 9 * bytes cycles
.macro add bytes, dest, arg1, arg2
    clc                         ; 2 cyc - nothing carried into the low byte
    .repeat bytes, ofs          ; 9 cycles per byte
        lda arg1 + ofs
        adc arg2 + ofs          ; carry ripples up to the next byte
        sta dest + ofs
    .endrepeat
.endmacro

; 16-bit add: dest = arg1 + arg2 (carry out left in C). Clobbers A.
.macro add16 dest, arg1, arg2
    add 2, dest, arg1, arg2
.endmacro

; 32-bit add: dest = arg1 + arg2 (carry out left in C). Clobbers A.
; Fixed: this used to expand to "add 4, dest, arg2, dest", which computed
; dest = arg2 + dest and silently ignored arg1, unlike add16 and sub32.
.macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
.endmacro

; Fold the current carry flag into the single byte at dest (dest += C).
; Intended to follow an add so its carry reaches the next higher byte.
; Clobbers A.
.macro add_carry dest
    lda dest
    adc #0
    sta dest
.endmacro

; Multi-byte subtract: dest = arg1 - arg2 over `bytes` bytes, low byte first.
; Clobbers A and the flags.
; 2 + 9 * bytes cycles
.macro sub bytes, dest, arg1, arg2
    sec                         ; 2 cyc - C set means "no borrow" for sbc
    .repeat bytes, ofs          ; 9 cycles per byte
        lda arg1 + ofs
        sbc arg2 + ofs          ; borrow ripples up to the next byte
        sta dest + ofs
    .endrepeat
.endmacro

; 16-bit subtract: dest = arg1 - arg2. Clobbers A.
.macro sub16 dest, arg1, arg2
    sub 2, dest, arg1, arg2
.endmacro

; 32-bit subtract: dest = arg1 - arg2. Clobbers A.
.macro sub32 dest, arg1, arg2
    sub 4, dest, arg1, arg2
.endmacro

; Shift the `bytes`-byte value at arg left by one bit, in place.
; A zero enters bit 0; the bit shifted out of the top byte ends up in C.
; A is not touched.
.macro shl bytes, arg
    asl arg                     ; low byte: 0 in, old bit 7 out to C
    .repeat (bytes) - 1, ofs
        rol arg + ofs + 1       ; each higher byte pulls in the previous C
    .endrepeat
.endmacro

; Shift the 16-bit value at arg left one bit, in place.
.macro shl16 arg
    shl 2, arg
.endmacro

; Shift the 24-bit value at arg left one bit, in place.
.macro shl24 arg
    shl 3, arg
.endmacro

; Shift the 32-bit value at arg left one bit, in place.
.macro shl32 arg
    shl 4, arg
.endmacro

; Copy `bytes` bytes from arg to dest, lowest address first. Clobbers A.
; 6 * bytes cycles
.macro copy bytes, dest, arg
    .repeat bytes, ofs          ; 6 cycles per byte
        lda arg + ofs           ; 3 cyc
        sta dest + ofs          ; 3 cyc
    .endrepeat
.endmacro

; Copy a 16-bit value from arg to dest. Clobbers A.
.macro copy16 dest, arg
    copy 2, dest, arg
.endmacro

; Copy a 32-bit value from arg to dest. Clobbers A.
.macro copy32 dest, arg
    copy 4, dest, arg
.endmacro

; Copy a 6-byte float48 value from arg to dest. Clobbers A.
.macro copyfloat dest, arg
    copy 6, dest, arg
.endmacro

; Two's-complement negate the `bytes`-byte value at arg in place,
; computed as 0 - arg from the low byte upward. Clobbers A and the flags.
; 2 + 8 * bytes cycles
.macro neg bytes, arg
    sec                         ; 2 cyc - start with no borrow
    .repeat bytes, ofs          ; 8 cycles per byte
        lda #0                  ; 2 cyc
        sbc arg + ofs           ; 3 cyc
        sta arg + ofs           ; 3 cyc
    .endrepeat
.endmacro

; Negate the 16-bit value at arg in place. Clobbers A.
; 18 cycles
.macro neg16 arg
    neg 2, arg
.endmacro

; Negate the 32-bit value at arg in place. Clobbers A.
; 34 cycles
.macro neg32 arg
    neg 4, arg
.endmacro

; inner loop for imul16
; One step of a shift-and-add multiply: test bit `bitnum` of arg1, add arg2
; into the top 16 bits of result when it is set, then shift result right by
; one bit.  Invoked 16 times (bitnum 0 .. 15) by imul16_func_orig.
; Clobbers A and the flags.
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next

    ; does 16-bit adds
    ; arg1 and arg2 are treated as unsigned
    ; negative signed inputs must be flipped first

    ; 7 cycles up to the branch

    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                 ; 3 cyc
        and #(1 << (bitnum))       ; 2 cyc
    .else
        lda arg1 + 1             ; 3 cyc
        and #(1 << ((bitnum) - 8)) ; 2 cyc
    .endif
    bne one ; 2 cyc

zero: ; 18 cyc, 23 cyc
    ; nothing to add: just start the right shift with a 0 entering the top
    lsr result + 3 ; 5 cyc
    jmp next       ; 3 cyc

one: ; 32 cyc, 37 cyc
    ; 16-bit add on the top bits
    clc            ; 2 cyc
    lda result + 2 ; 3 cyc
    adc arg2       ; 3 cyc
    sta result + 2 ; 3 cyc
    lda result + 3 ; 3 cyc
    adc arg2 + 1   ; 3 cyc
    ; the carry out of the add becomes the new top bit of result + 3
    ror a          ; 2 cyc - get a jump on the shift
    sta result + 3 ; 3 cyc
next:
    ; continue the right shift through the lower bytes
    ror result + 2 ; 5 cyc
    ror result + 1 ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ror result ; 5 cyc
    .endif

.endmacro

; 5 to 25 cycles
.macro check_sign arg
    ; Check sign bit and flip argument to positive,
    ; keeping a count of sign bits in the Y register.
    ;
    ; arg is a 16-bit value that is negated in place when negative.
    ; The caller must initialise Y beforehand; Y is incremented once for
    ; each negative argument.  Clobbers A.
    .local positive
    lda arg + 1   ; 3 cyc
    bpl positive  ; 2 cyc
    neg16 arg     ; 18 cyc
    iny           ; 2 cyc
positive:
.endmacro

; Signed 16 x 16 -> 32 bit multiply: dest (4 bytes) = arg1 * arg2.
; Goes through FR0/FR1/FR2 and imul16_func, so it clobbers those locations
; plus whatever imul16_func clobbers.
; NOTE(review): the cycle figures below match the counts written on
; imul16_func_orig; they have presumably not been re-measured for the
; table-based imul16_func that is actually called - TODO confirm.
; 518 - 828 cyc
.macro imul16 dest, arg1, arg2
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
    jsr imul16_func   ; 470-780 cyc
    copy32 dest, FR2  ; 24 cyc
.endmacro

; Shift the 32-bit value at arg left by `shift` bits (an assembly-time
; constant; the shift is unrolled), then round its top 16 bits with round16.
; The rounded 16-bit result is left in arg + 2 .. arg + 3.
.macro shift_round_16 arg, shift
    .repeat shift
        shl32 arg
    .endrepeat
    round16 arg
.endmacro

; Signed 16 x 16 multiply reduced back to 16 bits:
;   dest = top 16 bits of ((arg1 * arg2) << shift), rounded by round16.
; With shift = 4 the product of two fixed4.12 values comes back as fixed4.12.
; dest may be the same location as arg1 or arg2 (the inputs are copied to
; FR0/FR1 first).  Clobbers FR0, FR1, FR2 and whatever imul16_func clobbers.
.macro imul16_round dest, arg1, arg2, shift
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
    jsr imul16_func   ; 470-780 cyc
    shift_round_16 FR2, shift
    copy16 dest, FR2 + 2  ; 12 cyc
.endmacro

; Original bit-by-bit (shift-and-add) signed 16 x 16 -> 32 bit multiply.
; The imul16 / imul16_round macros in this file call imul16_func instead;
; nothing in this file references this routine.
;   in:  FR0 = arg1, FR1 = arg2 (16-bit signed, negated in place if negative)
;   out: FR2 .. FR2 + 3 = 32-bit signed product, low byte first
;   clobbers: A, Y, flags
; min 470 cycles
; max 780 cycles
.proc imul16_func_orig
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

    ldy #0          ; 2 cyc
    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc

    ; zero out the 32-bit temp's top 16 bits
    lda #0          ; 2 cyc
    sta result + 2  ; 3 cyc
    sta result + 3  ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts

    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    ; 440 to 696 cycles
    .repeat 16, bitnum
        ; bitnum < 8: 25 or 41 cycles
        ; bitnum >= 8: 30 or 46 cycles
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; In case of mixed input signs, return a negative result.
    ; (Y = 1 means exactly one input was negative; 0 or 2 leave it positive.)
    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
positive_result:

    rts ; 6 cyc
.endproc

; Adapted from https://everything2.com/title/Fast+6502+multiplication
;
; Unsigned 8 x 8 -> 16 bit multiply by table lookup, following the identity
;     a * x = (a + x)^2/2 - a^2/2 - x^2/2
; as spelled out in the step comments below.
;   arg1, arg2: one-byte operands (read only)
;   dest:       receives the product, low byte at dest, high byte at dest + 1
;   clobbers:   A, X, flags (Y is left alone)
; mul_lobyte256 / mul_hibyte256 / mul_hibyte512 are imported tables;
; presumably the low/high bytes of n^2/2 for n = 0..255 and the high byte for
; n = 256..511 - verify against the module that defines them.
;
; Fixed: the final low-byte subtraction was computed but never stored, so
; dest kept a stale low byte; "sta mul_product_lo" is restored below.
.macro imul8 dest, arg1, arg2
    .local under256
    .local next
    .local small_product
    .scope
        mul_factor_a   = arg1
        mul_factor_x   = arg2
        mul_product_lo = dest
        mul_product_hi = dest + 1

        lda mul_factor_a      ; setup: 6 cycles
        ;ldx mul_factor_x

        clc                   ; (a + x)^2/2: 23 cycles
        adc mul_factor_x
        tax                   ; X = low 8 bits of a + x, C = bit 8
        bcc under256
        lda mul_hibyte512,x
        bcs next              ; always taken; C is still set for the sbc below
    under256:
        lda mul_hibyte256,x
        sec                   ; both paths reach next with C set (no borrow)
    next:
        sta mul_product_hi
        lda mul_lobyte256,x

        ldx mul_factor_a      ; - a^2/2: 20 cycles
        sbc mul_lobyte256,x
        sta mul_product_lo
        lda mul_product_hi
        sbc mul_hibyte256,x
        sta mul_product_hi

        ldx mul_factor_x      ; + x & a & 1: 22 cycles
        txa                   ; (this is a kludge to correct a
        and mul_factor_a      ; roundoff error that makes odd * odd too low)
        and #1

        clc
        adc mul_product_lo
        bcc small_product
        inc mul_product_hi
    small_product:
        sec                   ; - x^2/2: 25 cycles
        sbc mul_lobyte256,x
        sta mul_product_lo    ; keep the corrected low byte of the product
        lda mul_product_hi
        sbc mul_hibyte256,x
        sta mul_product_hi
    .endscope
.endmacro

; Signed 16 x 16 -> 32 bit multiply built from four unsigned 8 x 8 products
; (imul8).  This is the routine the imul16 / imul16_round macros call.
;   in:  FR0 = arg1, FR1 = arg2 (16-bit signed, negated in place if negative)
;   out: FR2 .. FR2 + 3 = 32-bit signed product, low byte first
;   clobbers: A, X, Y, temp2, flags
.proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
    inter = temp2

    ldy #0          ; 2 cyc
    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc

    lda #0
    sta result + 0
    sta result + 1
    sta result + 2
    sta result + 3

    ; low * low lands in result bytes 0..1
    imul8 inter, arg1, arg2
    add16 result, result, inter

    ; high(arg1) * low(arg2) is added one byte up, into bytes 1..2
    imul8 inter, arg1 + 1, arg2
    add16 result + 1, result + 1, inter

    ; low(arg1) * high(arg2) also goes into bytes 1..2; this second add can
    ; carry out, so the carry is pushed into byte 3
    imul8 inter, arg1, arg2 + 1
    add16 result + 1, result + 1, inter
    add_carry result + 3

    ; high * high is added two bytes up, into bytes 2..3
    imul8 inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter

    ; In case of mixed input signs, return a negative result.
    ; (Y = 1 means exactly one input was negative; 0 or 2 leave it positive.)
    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
positive_result:

    rts ; 6 cyc
.endproc

.macro round16 arg
    ; Round top 16 bits of 32-bit fixed-point number in-place
    ;
    ; arg is a 4-byte value, low byte first.  The low word (arg, arg + 1) is
    ; the fraction being discarded; the high word (arg + 2, arg + 3) is
    ; incremented when rounding up.  Clobbers A and the flags.
    ;
    ; Fixed: when the fraction's high byte was exactly $80, a low byte of
    ; $80 .. $FF used to skip the increment (the test used bpl/bmi on the
    ; low byte), although the low word is then greater than $8000.
    .local increment
    .local next

    ; low word > $8000: round up
    ;          = $8000: round up   if positive
    ;                   round down if negative
    ;          < $8000: round down

    lda arg + 1
    cmp #$80
    bcc next        ; high byte < $80: low word < $8000, round down
    bne increment   ; high byte > $80: low word > $8000, round up

    ; high byte is exactly $80: the low byte decides
    lda arg
    bne increment   ; any nonzero low byte means low word > $8000

    ; exact tie ($8000): look at the sign of the whole value
    lda arg + 3
    bmi next

increment:       ; 5-10 cyc
    inc arg + 2  ; 5 cyc
    bne next     ; 2 cyc
    inc arg + 3  ; 5 cyc

next:

.endmacro

; Iterate z = z^2 + c for the point (cx, cy) until it escapes or the 8-bit
; iteration counter wraps.
;   clobbers: A, X, Y, FR0, FR1, FR2, temp2 (through imul16_round) and the
;             zero-page range zx .. iter
.proc mandelbrot
    ; input:
    ; cx: position scaled to 4.12 fixed point - -8..+7.9
    ; cy: position scaled to 4.12
    ;
    ; output:
    ; iter: iteration count at escape or 0

    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
    ; (one loop clears every byte from zx up to and including iter)
    lda #00
    ldx #(iter - zx + 1)
initloop:
    sta zx - 1,x
    dex
    bne initloop

loop:
    ; iter++ & max-iters break
    ; (iter wrapping from 255 to 0 ends the run with iter = 0)
    inc iter
    bne keep_going
    rts
keep_going:

    ; quick_exit arg, max: return from mandelbrot (rts) as soon as the
    ; fixed4.12 value arg is >= max or <= -max; otherwise fall through.
    ; Only the high byte is compared, except for the exact -max case.
    ; Clobbers A.
    .macro quick_exit arg, max
        .local positive
        .local negative
        .local nope_out
        .local first_equal
        .local all_done

        ; check sign bit
        lda arg + 1
        bmi negative

    positive:
        cmp #((max) << 4)
        bmi all_done ; 'less than'
        rts

    negative:
        cmp #(256 - ((max) << 4))
        beq first_equal ; 'equal' on first byte
        bpl all_done    ; 'greater than'

    nope_out:
        rts

    first_equal:
        lda arg
        beq nope_out  ; 2nd byte 0 shows it's really 'equal'

    all_done:
    .endmacro

    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2  - zy_2  + cx
    sub16 zx, zx_2, zy_2
    add16 zx, zx, cx
    quick_exit zx, 2

    ; zy = zx_zy + zx_zy + cy
    add16 zy, zx_zy, zx_zy
    add16 zy, zy, cy
    quick_exit zy, 2

    ; zx_2 = zx * zx
    imul16_round zx_2, zx, zx, 4

    ; zy_2 = zy * zy
    imul16_round zy_2, zy, zy, 4

    ; zx_zy = zx * zy
    imul16_round zx_zy, zx, zy, 4

    ; dist = zx_2 + zy_2
    add16 dist, zx_2, zy_2
    quick_exit dist, 4

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
    ; (that optimization is not implemented here; the loop just repeats)
    jmp loop

    ; nothing in this proc branches to peace_out
peace_out:
    rts

.endproc

; Convert a 16-bit screen coordinate into a fixed4.12 plane coordinate:
;   dest = (src << (8 - zoom)) * aspect
; src:    16-bit signed screen coordinate
; zoom:   location of the u8 zoom level (read at run time)
; aspect: location of a 16-bit fixed4.12 scale factor
; Clobbers A, X and everything imul16_round clobbers.
; NOTE: the shift loop only stops when X reaches 8, so a zoom level above 8
; would keep shifting until X wraps around.
.macro zoom_factor dest, src, zoom, aspect
    .local cont
    .local enough

    ; cx = (sx << (8 - zoom))
    copy16 dest, src
    ldx zoom
cont:
    cpx #8
    beq enough
    shl16 dest
    inx
    jmp cont
enough:

    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
    imul16_round dest, dest, aspect, 4
.endmacro

; Plot one 2-bit pixel.
;   in:  sx, sy = signed screen coordinates (only the low byte of sx is used
;                 for the column; sy is used as a full 16-bit value)
;        iter   = iteration count, mapped to a color through color_map
;   clobbers: A, X, Y, temp, pixel_ptr, pixel_color, pixel_mask,
;             pixel_shift, pixel_offset
.proc pset
    ; screen coords in signed sx,sy
    ; iter holds the target to use

    ; iter -> color
    ldx iter
    lda color_map,x
    sta pixel_color
    ; mask that clears the lowest 2-bit pixel position
    lda #(255 - 3)
    sta pixel_mask

    ; sy -> base address in pixel_ptr
    lda sy
    bpl positive

negative:
    ; sy < 0: top half.  The base is the END of the top half so that adding
    ; the (negative) sy * stride below steps back up to the right row.
    lda #.lobyte(framebuffer_top + stride * half_height)
    sta pixel_ptr
    lda #.hibyte(framebuffer_top + stride * half_height)
    sta pixel_ptr + 1
    jmp point

positive:

    ; sy >= 0: bottom half starts at row sy = 0
    lda #.lobyte(framebuffer_bottom)
    sta pixel_ptr
    lda #.hibyte(framebuffer_bottom)
    sta pixel_ptr + 1

point:

    ; pixel_ptr += sy * stride
    ;    temp * 40
    ; =  temp * 32  +  temp * 8
    ; = (temp << 5) + (temp << 3)
    copy16 temp, sy
    shl16 temp
    shl16 temp
    shl16 temp
    add16 pixel_ptr, pixel_ptr, temp
    shl16 temp
    shl16 temp
    add16 pixel_ptr, pixel_ptr, temp

    ; Ok so pixel_ptr points to the start of the line, which is 40 bytes.
    ; Get the byte and bit offsets
    ; temp = sx + half_width = column counted from the left edge
    lda sx
    clc
    adc #half_width
    sta temp

    ; pixel_shift = temp & 3 (pixel index within its byte)
    ; X = 3 - pixel_shift = number of 2-bit positions to move left;
    ; each pass shifts pixel_color left 2 bits (zeros in) and rotates
    ; pixel_mask left 2 bits (ones in)
    and #3
    sta pixel_shift
    lda #3
    sec
    sbc pixel_shift
    tax
shift_loop:
    ; Z comes from tax on entry and from dex afterwards (jmp leaves flags)
    beq shift_done
    asl pixel_color
    asl pixel_color
    sec
    rol pixel_mask
    sec
    rol pixel_mask
    dex
    jmp shift_loop
shift_done:

    ; pixel_offset = temp >> 2
    lda temp
    lsr a
    lsr a
    sta pixel_offset
    tay

    ; read, mask, or, write
    lda (pixel_ptr),y
    and pixel_mask
    ora pixel_color
    sta (pixel_ptr),y

    rts
.endproc

; Write `len` characters of the string at cstr into the status line,
; starting at text column `col`.  Each byte is translated through char_map
; before being stored in textbuffer.
.macro draw_text col, len, cstr
    ; clobbers A, X, Y
    .local loop
    .local done
    ldx #0
loop:
    cpx #len
    beq done
    ldy cstr,x      ; Y = source character, used as the char_map index
    lda char_map,y
    sta textbuffer + col,x
    inx
    jmp loop
done:
.endmacro

; Vertical blank handler installed by start through SETVBV.
; Counts frames in count_frames (u8, wraps at 256) and exits via XITVBV.
.proc vblank_handler
    inc count_frames
    jmp XITVBV
.endproc

; Placeholder: this proc emits no instructions at all (not even an rts), so
; it must not be called as it stands.  The steps listed below are currently
; carried out inline in start (update_status).
.proc update_speed
    ; convert frames (u16) to fp
    ; add to frames_total
    ; convert pixels (u16) to fp
    ; add to pixels_total
    ; (frames_total * 16.66666667) / pixels_total
    ; convert to ATASCII
    ; draw text
.endproc

; Program entry point.
; Clears the frame buffers, installs the display list and the vblank frame
; counter, then renders every pixel (sy = -half_height .. half_height - 1,
; sx = -half_width .. half_width - 1) through mandelbrot + pset while keeping
; a running ms-per-pixel readout in the status line.  Never returns.
;
; Fixed: the two "time to update the status?" tests used signed branches
; (bmi) after cmp on unsigned counters.  With width = 160 that made the
; pixel test fire when count_pixels reached 32 instead of 160, and the frame
; test misbehaved for count_frames >= 248.  They now use the carry flag.
.proc start

    ; ox = 0; oy = 0; zoom = 0
    ; count_frames = 0; count_pixels = 0
    lda #0
    sta ox
    sta ox + 1
    sta oy
    sta oy + 1
    sta count_frames
    sta count_pixels

    ; total_ms = 0.0; total_pixels = 0.0
    ldx #total_ms
    jsr ZF1
    ldx #total_pixels
    jsr ZF1

    ; zoom = 2x
    lda #1
    sta zoom

    ; Disable display DMA
    lda #0
    sta DMACTL

    ; zero the range from framebuffer_top to framebuffer_end
    lda #.lobyte(framebuffer_top)
    sta temp
    lda #.hibyte(framebuffer_top)
    sta temp + 1

zero_page_loop:
    ; clear one 256-byte page per pass
    lda #0
    ldy #0
zero_byte_loop:
    sta (temp),y
    iny
    bne zero_byte_loop

    inc temp + 1
    lda temp + 1
    cmp #.hibyte(framebuffer_end)
    bne zero_page_loop

    ; Copy the display list into properly aligned memory
    ; Can't cross 1024-byte boundaries :D
    ldx #0
copy_byte_loop:
    lda display_list_start,x
    sta display_list,x
    inx
    cpx #display_list_len
    bne copy_byte_loop

    ; Set up the display list
    lda #.lobyte(display_list)
    sta DLISTL ; actual register
    sta SDLSTL ; shadow register the OS will copy in
    lda #.hibyte(display_list)
    sta DLISTH ; actual register
    sta SDLSTH ; shadow register the OS will copy in

    ; Status bar
    draw_text 0, str_self_len, str_self
    draw_text 40 - str_run_len, str_run_len, str_run

    ; Re-enable display DMA
    lda #$22
    sta DMACTL

    ; install the vblank handler
    lda #7 ; deferred
    ldx #.hibyte(vblank_handler)
    ldy #.lobyte(vblank_handler)
    jsr SETVBV

main_loop:
    ; sy = -92 .. 91
    lda #(256-half_height)
    sta sy
    lda #(256-1)
    sta sy + 1

loop_sy:
    ; sx = -80 .. 79
    lda #(256-half_width)
    sta sx
    lda #(256-1)
    sta sx + 1

loop_sx:
    zoom_factor cx, sx, zoom, aspect_x
    zoom_factor cy, sy, zoom, aspect_y
    jsr mandelbrot
    jsr pset


    ; check if we should update the counters
    ;
    ; count_pixels >= width? update!
    inc count_pixels
    lda count_pixels
    cmp #width
    bcs update_status ; unsigned >=

    ; count_frames >= 120? update!
    lda count_frames
    cmp #120 ; >= 2 seconds
    bcc skip_status   ; unsigned <

update_status:
    ; FR0 = (float)count_pixels & clear count_pixels
    lda count_pixels
    sta FR0
    lda #0
    sta FR0 + 1
    sta count_pixels
    jsr IFP

    ; FR1 = total_pixels
    ldx #.lobyte(total_pixels)
    ldy #.hibyte(total_pixels)
    jsr FLD1R

    ; FR0 += FR1
    jsr FADD

    ; total_pixels = FR0
    ldx #.lobyte(total_pixels)
    ldy #.hibyte(total_pixels)
    jsr FST0R


    ; FR0 = (float)count_frames & clear count_frames
    ; warning: this should really disable interrupts @TODO
    ; (vblank_handler may increment count_frames between the read and the
    ; clear below, in which case that frame is lost)
    lda count_frames
    sta FR0
    lda #0
    sta FR0 + 1
    sta count_frames
    jsr IFP

    ; FR0 *= ms_per_frame
    ldx #.lobyte(ms_per_frame)
    ldy #.hibyte(ms_per_frame)
    jsr FLD1R
    jsr FMUL

    ; FR0 += total_ms
    ldx #total_ms
    ldy #0
    jsr FLD1R
    jsr FADD

    ; total_ms = FR0
    ldx #total_ms
    ldy #0
    jsr FST0R

    ; FR0 /= total_pixels
    ldx #total_pixels
    ldy #0
    jsr FLD1R
    jsr FDIV

    ; convert to ASCII in INBUFF
    jsr FASC

    ; copy characters to the status line until the one with the high bit set
    ldy #0
number_loop:
    lda (INBUFF),y
    bmi lastchar

    tax
    lda char_map,x
    sta textbuffer + speed_start,y

    iny
    bpl number_loop
lastchar:
    ; Y is last char
    ; trim that high bit
    and #$7f
    tax
    lda char_map,x
    sta textbuffer + speed_start,y

    ; Fill out any remaining spaces
    lda #0
space_loop:
    iny
    sta textbuffer + speed_start,y
    cpy #(20)
    bmi space_loop

skip_status:

    ; sx++
    clc
    lda sx
    adc #1
    sta sx
    lda sx + 1
    adc #0
    sta sx + 1

    ; end of row once the low byte of sx reaches half_width
    lda sx
    cmp #half_width
    beq loop_sx_done
    jmp loop_sx

loop_sx_done:

    ; sy++
    clc
    lda sy
    adc #1
    sta sy
    lda sy + 1
    adc #0
    sta sy + 1

    ; end of picture once the low byte of sy reaches half_height
    lda sy
    cmp #half_height
    beq loop_sy_done
    jmp loop_sy

loop_sy_done:

    draw_text 40 - str_done_len, str_done_len, str_done

loop:
    ; finished
    jmp loop
.endproc