; Our zero-page vars
; All multi-byte values are stored little-endian (low byte first).
; NOTE: mandelbrot clears the contiguous range zx..iter in a single loop,
; so everything from zx through iter must stay adjacent and in this order.
sx    = $80     ; i16: screen pixel x
sy    = $82     ; i16: screen pixel y
ox    = $84     ; fixed4.12: center point x
oy    = $86     ; fixed4.12: center point y
cx    = $88     ; fixed4.12: c_x
cy    = $8a     ; fixed4.12: c_y
zx    = $8c     ; fixed4.12: z_x
zy    = $8e     ; fixed4.12: z_y

zx_2  = $90     ; fixed4.12: z_x^2
zy_2  = $92     ; fixed4.12: z_y^2
zx_zy = $94     ; fixed4.12: z_x * z_y
dist  = $96     ; fixed4.12: z_x^2 + z_y^2

iter         = $a0 ; u8: iteration count

zoom         = $a1 ; u8: zoom shift level (scale_zoom shifts left until this reaches 8)
count_frames = $a2 ; u8: incremented by vblank_handler, cleared on each status update
count_pixels = $a3 ; u8: pixels computed since the last status update
total_ms     = $a4 ; float48
total_pixels = $aa ; float48

z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start  = $b1 ; u8: index into z_buffer
z_buffer_end    = $b2 ; u8: index into z_buffer
temp            = $b4 ; u16
temp2           = $b6 ; u16: also the 'inter' partial product of the 16-bit multiply
pixel_ptr       = $b8 ; u16: (= temp2 + 2, which the XE imul8 path uses as scratch)
pixel_color     = $ba ; u8
pixel_mask      = $bb ; u8
pixel_shift     = $bc ; u8
pixel_offset    = $bd ; u8
fill_level      = $be ; u8
palette_offset  = $bf ; u8

; FP registers in zero page
; FR0/FR1/FR2 are also reused as scratch by the integer multiply code
; (imul16_impl / sqr16_impl / imul8xe_init_section) in this file.
FR0    = $d4 ; float48
FRE    = $da
FR1    = $e0 ; float48
FR2    = $e6 ; float48
CIX    = $f2 ; u8 - index into INBUFF
INBUFF = $f3 ; u16 - pointer to ascii
FLPTR  = $fc ; u16 - pointer to user buffer float48

CH1    = $02f2 ; previous character read from keyboard
CH     = $02fc ; current character read from keyboard ($ff = none, see keycheck)

LBUFF  = $0580 ; result buffer for FASC routine

; FP ROM routine vectors
; (YYXX) below means: X = low byte of address, Y = high byte of address
FASC   = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
IFP    = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
FADD   = $DA66 ; ADDITION       (FR0 += FR1)
FSUB   = $DA60 ; SUBTRACTION    (FR0 -= FR1)
FMUL   = $DADB ; MULTIPLICATION (FR0 *= FR1)
FDIV   = $DB28 ; DIVISION       (FR0 /= FR1)
ZF1    = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
FLD0R  = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
FLD1R  = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
FST0R  = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
FMOVE  = $DDB6 ; MOVE FR0 TO FR1

; High data
; The display list built below shows framebuffer_top for the upper half of
; the picture and framebuffer_bottom for the lower half; textbuffer holds the
; single status text row. clear_screen zeroes framebuffer_top..display_list.
framebuffer_top    = $8000
textbuffer         = $8f00
framebuffer_bottom = $9000
display_list       = $9f00
framebuffer_end    = $a000

height = 184
half_height = height >> 1
width = 160
half_width = width >> 1
stride = width >> 2 ; bytes per scan line: 4 pixels per byte at 2bpp

EXTENDED_RAM = $4000 ; 16KiB bank on the XE
PORTB  = $D301 ; memory & bank-switch for XL/XE

DMACTL = $D400
DLISTL = $D402
DLISTH = $D403
WSYNC  = $D40A

; OS shadow registers
SDLSTL = $230
SDLSTH = $231

; interrupt stuff
SYSVBV = $E45F
XITVBV = $E462 ; exit vector used by the deferred vblank handler
SETVBV = $E45C ; install a vblank vector (A = which, X = high byte, Y = low byte)

COLOR0 = $2C4
COLOR1 = $2C5
COLOR2 = $2C6
COLOR3 = $2C7
COLOR4 = $2C8

; Keycodes!
; Values as read from CH. The arrow codes are the same keys as +, -, etc.
; with bit 7 set (presumably the Control modifier - confirm against the
; keyboard code table).
KEY_PLUS  = $06
KEY_MINUS = $0e
KEY_UP    = $8e
KEY_DOWN  = $8f
KEY_LEFT  = $86
KEY_RIGHT = $87

; Layout of a 6-byte float as used by the FP ROM routines:
; one exponent/sign byte followed by five mantissa bytes
; (see ms_per_frame for a worked BCD example).
.struct float48
    exponent .byte
    mantissa .byte 5
.endstruct

.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512

.data

; Status-bar strings. They are stored as plain character codes and are
; translated through char_map by draw_text before being written to
; textbuffer. Each string has a matching *_end label so its length can be
; computed at assembly time.
strings:
str_self:
    .byte "MANDEL-6502"
str_self_end:
str_speed:
    .byte " ms/px"
str_speed_end:
str_run:
    .byte " RUN"
str_run_end:
str_done:
    .byte "DONE"
str_done_end:

str_self_len = str_self_end - str_self
str_speed_len = str_speed_end - str_speed
str_run_len = str_run_end - str_run
str_done_len = str_done_end - str_done
speed_precision = 6 ; number of FASC output characters shown

; Column where the speed readout starts: right-aligned on the 40-column row,
; leaving room for the RUN/DONE marker plus one blank column.
speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1
speed_len = 14 + str_speed_len ; not referenced in the visible part of this file


char_map:
    ; Map ATASCII string values to framebuffer font entries
    ; Sighhhhh
    ;
    ; 128-entry table indexed by character code:
    ;   codes   0..31  -> 64..95
    ;   codes  32..95  ->  0..63
    ;   codes  96..127 -> 96..127 (unchanged)
    ; Indices >= 128 are outside the table.
    .repeat 32, i
        .byte i + 64
    .endrepeat
    .repeat 64, i
        .byte i
    .endrepeat
    .repeat 32, i
        .byte 96 + i
    .endrepeat

; Hex digit characters, indexed by nibble value
; (not referenced in the visible part of this file)
hex_chars:
    .byte "0123456789abcdef"

aspect:
    ; aspect ratio!
    ; pixels at 320w are 5:6 (narrow)
    ; pixels at 160w are 5:3 (wide)
    ;
    ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
    ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
    ;
    ; so at zoom = 1:
    ;   vertical range -92 .. 91.9 is -2.15625 .. 2.15624
    ; horizontal range -80 .. 79.9 is -3.125 .. 3.124
    ;
    ; 184h is the equiv of 220.8h at square pixels
    ; 320 / 220.8 = 1.45 display aspect ratio
    ;
    ; Both factors are 16-bit words with 12 fractional bits (fixed4.12),
    ; i.e. 1.0 == 1 << 12, so n/4 is written as n << (12 - 2).
aspect_x: ; fixed4.12 5/4
    .word 5 << (12 - 2)

aspect_y: ; fixed4.12 3/4
    .word 3 << (12 - 2)

; Milliseconds per counted frame (1000 / 60), multiplied by count_frames
; in the status update to get elapsed time.
ms_per_frame: ; float48 16.66666667
    .byte 64  ; exponent/sign
    .byte $16 ; BCD digits
    .byte $66
    .byte $66
    .byte $66
    .byte $67

; Display list template. start copies display_list_len bytes from here to
; display_list before pointing the hardware at it.
display_list_start:
    ; 24 lines overscan
    .repeat 3
        .byte $70 ; 8 blank lines
    .endrep

    ; 8 scan lines, 1 row of 40-column text
    .byte $42
    .addr textbuffer

    ; 184 lines graphics
    ; ANTIC mode e (160px 2bpp, 1 scan line per line)
    ; Split into two halves, each with its own load-memory-scan address
    ; ($4e = mode e + LMS), so the top and bottom live in separate buffers.
    .byte $4e
    .addr framebuffer_top
    .repeat half_height - 1
        .byte $0e
    .endrep
    .byte $4e
    .addr framebuffer_bottom
    .repeat half_height - 1
        .byte $0e
    .endrep

    .byte $41 ; jump and wait for vertical blank, restarting the list
    .addr display_list
display_list_end:
display_list_len = display_list_end - display_list_start

; iter -> 2-bit pixel value, 256 entries.
; Entry 0 (iter == 0, i.e. "in the set") maps to pixel value 0; every
; other iteration count cycles through 1, 2, 3.
color_map:
    .byte 0
    .repeat 85
        .byte 1
        .byte 2
        .byte 3
    .endrepeat

; Base colour values: palette + 0 goes to COLOR4; palette + 1..3 are
; offset by palette_offset and written to COLOR0..COLOR2 (see update_palette).
palette:
    .byte $00
    .byte $46
    .byte $78
    .byte $b4
.code

; Ring buffer of recent z values used by mandelbrot for cycle detection.
; Each entry is 4 bytes (zx word, zy word), so byte indices run from 0 to
; z_buffer_len * 4 - 1. z_buffer_start / z_buffer_end are byte indices.
z_buffer_len = 16
z_buffer_mask = z_buffer_len - 1 ; not referenced in the visible part of this file
z_buffer:
    ; the last N zx/zy values
    .repeat z_buffer_len
        .word 0
        .word 0
    .endrepeat

.export start

; Progressive rendering: the picture is drawn in max_fill_level passes.
; In pass N a pixel is drawn when (sx & fill_masks[N]) == 0 and
; (sy & fill_masks[N]) == 0, unless an earlier pass already covered it.
; The first pass therefore draws every 32nd pixel in each direction and the
; last pass (mask 0) fills in everything that remains.
max_fill_level = 6
fill_masks:
    .byte %00011111
    .byte %00001111
    .byte %00000111
    .byte %00000011
    .byte %00000001
    .byte %00000000

; Preset viewports, indexed by the value loaded into X at the top of start
; (0 = overview, 1 = closeup). ox/oy entries are fixed4.12 words.
viewport_zoom:
    .byte 1
    .byte 8

viewport_ox:
    .word $0000
    .word $f110

viewport_oy:
    .word $0000
    .word $fbe0

; Multi-byte add: dest = arg1 + arg2, over 'bytes' little-endian bytes.
; The carry out of the top byte is left in C. Clobbers A and flags.
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
    clc ; 2 cyc
    .repeat bytes, byte ; 9 * byte cycles
        lda arg1 + byte
        adc arg2 + byte
        sta dest + byte
    .endrepeat
.endmacro

; 16-bit dest = arg1 + arg2
.macro add16 dest, arg1, arg2
    add 2, dest, arg1, arg2
.endmacro

; 32-bit dest = arg1 + arg2 (little-endian). Clobbers A and flags.
; Same operand order as add16; dest may alias either source.
.macro add32 dest, arg1, arg2
    add 4, dest, arg1, arg2
.endmacro

; Add the current carry flag into the single byte at dest.
; Used to propagate the carry of a preceding add into the next byte up.
; Clobbers A and flags.
.macro add_carry dest
    lda dest
    adc #0
    sta dest
.endmacro

; Multi-byte subtract: dest = arg1 - arg2, over 'bytes' little-endian bytes.
; Clobbers A and flags.
; 2 + 9 * byte cycles
.macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
    .repeat bytes, byte ; 9 * byte cycles
        lda arg1 + byte
        sbc arg2 + byte
        sta dest + byte
    .endrepeat
.endmacro

; 16-bit dest = arg1 - arg2
.macro sub16 dest, arg1, arg2
    sub 2, dest, arg1, arg2
.endmacro

; 32-bit dest = arg1 - arg2
.macro sub32 dest, arg1, arg2
    sub 4, dest, arg1, arg2
.endmacro

; Multi-byte shift left by one bit, in place, over 'bytes' little-endian
; bytes. A zero is shifted into bit 0 and the old top bit ends up in C.
; Does not touch A, X or Y.
.macro shl bytes, arg
    asl arg
    .repeat bytes-1, i
        rol arg + 1 + i
    .endrepeat
.endmacro

; 16-bit shift left by one
.macro shl16 arg
    shl 2, arg
.endmacro

; 24-bit shift left by one
.macro shl24 arg
    shl 3, arg
.endmacro

; 32-bit shift left by one
.macro shl32 arg
    shl 4, arg
.endmacro

; Copy 'bytes' bytes from arg to dest, lowest address first.
; Clobbers A and the N/Z flags.
; 6 * bytes cycles
.macro copy bytes, dest, arg
    .repeat bytes, byte ; 6 * bytes cycles
        lda arg + byte  ; 3 cyc
        sta dest + byte ; 3 cyc
    .endrepeat
.endmacro

; copy a 16-bit value
.macro copy16 dest, arg
    copy 2, dest, arg
.endmacro

; copy a 32-bit value
.macro copy32 dest, arg
    copy 4, dest, arg
.endmacro

; copy a 6-byte float48
.macro copyfloat dest, arg
    copy 6, dest, arg
.endmacro

; Two's-complement negate in place: arg = 0 - arg, over 'bytes'
; little-endian bytes. Clobbers A and flags.
; 2 + 8 * byte cycles
.macro neg bytes, arg
    sec ; 2 cyc
    .repeat bytes, byte ; 8 * byte cycles
        lda #00         ; 2 cyc
        sbc arg + byte  ; 3 cyc
        sta arg + byte  ; 3 cyc
    .endrepeat
.endmacro

; 18 cycles
.macro neg16 arg
    neg 2, arg
.endmacro

; 34 cycles
.macro neg32 arg
    neg 4, arg
.endmacro

; Shift the 32-bit value at arg left by 'shift' bits (an assembly-time
; constant), then round its top 16 bits in place using round16.
; The rounded result is left in arg + 2 .. arg + 3.
.macro shift_round_16 arg, shift
    .repeat shift
        shl32 arg
    .endrepeat
    round16 arg
.endmacro

; 16-bit fixed-point multiply: dest = round((arg1 * arg2) << shift) >> 16.
; With fixed4.12 inputs and shift = 4 the result is again fixed4.12.
; Loads FR0/FR1, calls imul16_func (32-bit product in FR2), then shifts,
; rounds and stores the top word. dest may alias arg1 or arg2 because both
; are copied out before the call.
; Clobbers A, FR0, FR1, FR2 and whatever imul16_func clobbers.
.macro imul16_round dest, arg1, arg2, shift
    copy16 FR0, arg1  ; 12 cyc
    copy16 FR1, arg2  ; 12 cyc
    jsr imul16_func   ; ? cyc
    shift_round_16 FR2, shift
    copy16 dest, FR2 + 2  ; 12 cyc
.endmacro

; 16-bit fixed-point square: dest = round((arg * arg) << shift) >> 16.
; Only FR0 is loaded here; sqr16_func is expected to square FR0 on its own
; and leave the 32-bit product in FR2.
; Clobbers A, FR0, FR2 and whatever sqr16_func clobbers.
.macro sqr16_round dest, arg, shift
    ;imul16_round dest, arg, arg, shift
    copy16 FR0, arg   ; 12 cyc
    jsr sqr16_func      ; ? cyc
    shift_round_16 FR2, shift
    copy16 dest, FR2 + 2  ; 12 cyc
.endmacro

; 8-bit square by table lookup: 16-bit dest = table[arg], low byte from
; sqr_lobyte and high byte from sqr_hibyte.
; Takes exactly two parameters (dest, arg).
; NOTE(review): sqr_lobyte / sqr_hibyte are not defined or imported in the
; visible part of this file - confirm they are provided before expanding
; this macro.
; clobbers a, x
.macro sqr8 dest, arg
    ldx arg
    lda sqr_lobyte,x
    sta dest
    lda sqr_hibyte,x
    sta dest + 1
.endmacro

; lookup table for top byte -> PORTB value for bank-switch
; Entry i holds the PORTB value that selects bank (i >> 6): the top two
; bits of the index are moved to bits 3..2 and OR'd with $e1, matching the
; value produced by the bank_switch macro for the same bank number.
;.align 256 ; warning - if not aligned this will cost an extra cycle sometimes
bank_switch_table:
    .repeat 256, i
        .byte ((i & $c0) >> 4) | $e1
    .endrepeat

; Select extended-RAM bank 'bank' (assembly-time constant 0..3) by writing
; PORTB. The bank number goes into bits 3..2; $e1 supplies the other bits.
; Clobbers A.
.macro bank_switch bank
    lda #((bank << 2) | $e1)
    sta PORTB
.endmacro

; 8-bit x 8-bit -> 16-bit multiply: dest (2 bytes) = arg1 * arg2.
; 'xe' is an assembly-time flag choosing the implementation:
;   xe != 0: look the product up in the table built by imul8xe_init in the
;            four extended-RAM banks. Clobbers A, X, Y, dest .. dest + 3
;            (dest + 2/3 is used as a pointer) and leaves PORTB switched to
;            the bank that was read.
;   xe == 0: table-of-half-squares method using the imported
;            mul_lobyte256 / mul_hibyte256 / mul_hibyte512 tables.
;            Clobbers A, X and dest .. dest + 1.
.macro imul8 dest, arg1, arg2, xe
    .if xe
        ; using 64KB lookup table
        ; 58-77 cycles
        ; clobbers x, y, dest to dest + 3
        .scope
            output = dest
            ptr = dest + 2 ; scratch space assumed

            ; bottom 14 bits except the LSB are the per-bank table index
            ; add $4000 for the bank pointer
            ; (low byte = arg1 with bit 0 cleared, high byte = $40 + low 6 bits of arg2)
            lda arg1     ; 3 cyc
            and #$fe     ; 2 cyc
            sta ptr      ; 3 cyc
            lda arg2     ; 3 cyc
            and #$3f     ; 2 cyc
            clc          ; 2 cyc
            adc #$40     ; 2 cyc
            sta ptr + 1  ; 3 cyc
            
            ; top 2 bits are the table bank selector
            ldx arg2                ; 3 cyc
            lda bank_switch_table,x ; 4 cyc
            sta PORTB               ; 4 cyc


            ; copy the entry into output
            ldy #0       ; 2 cyc
            lda (ptr),y  ; 5 cyc
            sta output   ; 3 cyc
            iny          ; 2 cyc
            lda (ptr),y  ; 5 cyc
            sta output+1 ; 3 cyc

            ; note: we are not restoring memory to save 6 cycles!
            ; this means those 16kb have to be switched back to base RAM
            ; if we need to use them anywhere else
            ;;; restore memory
            ;;lda #$81     ; 2 cyc - disabled
            ;;sta PORTB    ; 4 cyc - disabled

            ; check that 1 bit we skipped to fit into space
            ; (the table only stores products for even arg1)
            lda arg1     ; 3 cyc
            and #1       ; 2 cyc
            beq done     ; 2 cyc

            ; add the second param one last time for the skipped bit
            clc          ; 2 cyc
            lda arg2     ; 3 cyc
            adc output   ; 3 cyc
            sta output   ; 3 cyc
            lda #0       ; 2 cyc
            adc output+1 ; 3 cyc
            sta output+1 ; 3 cyc

        done:
        .endscope
    .else
        ; Using base 48k RAM compatibility mode
        ; Small table of half squares
        ; Adapted from https://everything2.com/title/Fast+6502+multiplication
        ; 81-92 cycles
        ;
        ; a * x = (a + x)^2/2 - a^2/2 - x^2/2, with a one-bit correction
        ; for the rounding lost when both factors are odd.
        .scope
            mul_factor_a   = arg1
            mul_factor_x   = arg2
            mul_product_lo = dest
            mul_product_hi = dest + 1

            lda mul_factor_a      ; 3 cyc

            ; (a + x)^2/2
            ; the sum can be up to 510, so the carry picks which high-byte
            ; table applies
            clc                   ; 2 cyc         
            adc mul_factor_x      ; 3 cyc
            tax                   ; 2 cyc
            bcc under256          ; 2 cyc
            lda mul_hibyte512,x   ; 4 cyc
            bcs next              ; 2 cyc
        under256:
            lda mul_hibyte256,x   ; 4 cyc
            sec                   ; 2 cyc
        next:
            ; carry is set on both paths here, ready for the sbc below
            sta mul_product_hi    ; 3 cyc
            lda mul_lobyte256,x   ; 4 cyc

            ; - a^2/2
            ldx mul_factor_a      ; 3 cyc
            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc

            ; + x & a & 1:
            ; (this is a kludge to correct a
            ; roundoff error that makes odd * odd too low)
            ldx mul_factor_x      ; 3 cyc
            txa                   ; 2 cyc
            and mul_factor_a      ; 3 cyc
            and #1                ; 2 cyc

            clc                   ; 2 cyc
            adc mul_product_lo    ; 3 cyc
            bcc small_product     ; 2 cyc
            inc mul_product_hi    ; 5 cyc

            ; - x^2/2
        small_product:
            sec                   ; 2 cyc
            sbc mul_lobyte256,x   ; 4 cyc
            sta mul_product_lo    ; 3 cyc
            lda mul_product_hi    ; 3 cyc
            sbc mul_hibyte256,x   ; 4 cyc
            sta mul_product_hi    ; 3 cyc
        .endscope
    .endif
.endmacro

; One-time setup of the extended-RAM multiply path.
;
; Probes for bank-switched RAM at EXTENDED_RAM. When it is present, the
; entry points imul16_func and sqr16_func are overwritten with jumps to
; their XE variants and the product table is generated in all four banks.
; When it is absent, nothing is patched (the probe's write to EXTENDED_RAM
; simply lands in base RAM).
;
; Clobbers A, Y, FR0, FR1, FR2, temp2 and PORTB.
.proc imul8xe_init
    first_factor  = FR1
    second_factor = FR2

    ; Probe: store different values through bank 0 and bank 1, then look
    ; at bank 0 again. It still reads 0 only if the two banks are distinct.
    bank_switch 0
    lda #0
    sta EXTENDED_RAM
    bank_switch 1
    lda #1
    sta EXTENDED_RAM
    bank_switch 0
    lda EXTENDED_RAM
    bne no_banks

    ; Turn both routines into forwarding thunks: jmp <xe variant>
    lda #$4c ; 'jmp' opcode
    sta imul16_func
    sta sqr16_func

    lda #.lobyte(imul16xe_func)
    sta imul16_func + 1
    lda #.hibyte(imul16xe_func)
    sta imul16_func + 2

    lda #.lobyte(sqr16xe_func)
    sta sqr16_func + 1
    lda #.hibyte(sqr16xe_func)
    sta sqr16_func + 2

    ; Build the table, 16KB per bank. The factor counters start at zero
    ; and carry over from one section to the next.
    lda #$00
    sta second_factor
    sta first_factor

    bank_switch 0              ; $00 * $00 -> $3f * $ff
    jsr imul8xe_init_section

    bank_switch 1              ; $40 * $00 -> $7f * $ff
    jsr imul8xe_init_section

    bank_switch 2              ; $80 * $00 -> $bf * $ff
    jsr imul8xe_init_section

    bank_switch 3              ; $c0 * $00 -> $ff * $ff
    jsr imul8xe_init_section

no_banks:
    rts
.endproc

; Initialize a 16 KB chunk of the table
; The caller must already have switched in the target bank.
; input: arg2 (FR2) = first row multiplier for this bank; arg1 (FR1) = 0
; output: arg2 advanced by $40, ready for the next bank; arg1 back at 0
; clobbers: A, Y, FR0 (result), temp2 (ptr)
;
; Table layout: the page at $4000 + (n << 8) holds the row for the n-th
; arg2 value of this bank; within a page, the 16-bit entry at even offset
; k is k * arg2 (odd k is handled by imul8 with one extra add).
.proc imul8xe_init_section
    arg1 = FR1
    arg2 = FR2
    result = FR0
    ptr = temp2

    lda #$00
    sta ptr
    lda #$40
    sta ptr + 1

    ldy #0

    ; outer loop: $00 -> $3f
outer_loop:

    ; reset result to 0
    lda #0
    sta result
    sta result + 1

    ; inner loop: $00 -> $ff
inner_loop:

    ; copy result to data set
    ; (Y is bumped to 1 for the high byte and restored to 0 afterwards)
    lda result
    sta (ptr),y
    lda result + 1
    iny
    sta (ptr),y
    dey

    ; result += 2 * arg2
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1
    clc
    lda arg2
    adc result
    sta result
    lda #0
    adc result + 1
    sta result + 1

    ; inner loop check
    ; step by two; the low byte of ptr wraps to 0 after 128 entries
    inc arg1
    inc arg1
    inc ptr
    inc ptr
    bne inner_loop

    ; outer loop check
    ; next row / next page, until the pointer leaves $4000..$7fff
    inc arg2
    inc ptr + 1
    lda ptr + 1
    cmp #$80
    bne outer_loop

    rts

.endproc

; Body of a signed 16 x 16 -> 32-bit multiply routine, ending in rts.
; input:  FR0 = arg1 (16-bit), FR1 = arg2 (16-bit)
; output: FR2 .. FR2 + 3 = 32-bit product
; 'xe' is passed through to imul8 to pick the lookup strategy.
; Uses temp2 as the partial-product scratch; in the xe variant imul8 also
; scribbles on the two bytes after its destination (FR2 + 2/3 and
; temp2 + 2/3).
; Clobbers A, X (and Y in the xe variant).
.macro imul16_impl xe
    .local arg1
    .local arg2
    .local result
    .local inter
    .local arg1_pos
    .local arg2_pos
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
    inter = temp2

    ; h1l1 * h2l2
    ; (h1*256 + l1) * (h2*256 + l2)
    ; h1*256*(h2*256 + l2) + l1*(h2*256 + l2)
    ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2

    ; l1 * l2 -> result bytes 0..1; the upper word is cleared afterwards
    imul8 result, arg1, arg2, xe
    lda #0
    sta result + 2
    sta result + 3

    ; h1 * l2, added at byte offset 1
    imul8 inter, arg1 + 1, arg2, xe
    add16 result + 1, result + 1, inter
    add_carry result + 3

    ; l1 * h2, added at byte offset 1
    imul8 inter, arg1, arg2 + 1, xe
    add16 result + 1, result + 1, inter
    add_carry result + 3

    ; h1 * h2, added at byte offset 2
    imul8 inter, arg1 + 1, arg2 + 1, xe
    add16 result + 2, result + 2, inter

    ; In case of negative inputs, adjust high word
    ; https://stackoverflow.com/a/28827013
    ; (the partial products above treat both inputs as unsigned)
    lda arg1 + 1
    bpl arg1_pos
    sub16 result + 2, result + 2, arg2
arg1_pos:
    lda arg2 + 1
    bpl arg2_pos
    sub16 result + 2, result + 2, arg1
arg2_pos:

    rts ; 6 cyc
.endmacro

; Body of a signed 16-bit square routine, ending in rts.
; input:  FR0 = arg (16-bit)
; output: FR2 .. FR2 + 3 = 32-bit arg * arg
; 'xe' is passed through to imul8 for the cross term; the two pure squares
; come from the sqr8 lookup tables.
; Uses temp2 as the partial-product scratch; in the xe variant imul8 also
; scribbles on temp2 + 2/3.
; Clobbers A, X (and Y in the xe variant).
; NOTE(review): sqr8 needs sqr_lobyte / sqr_hibyte, which are not imported
; in the visible part of this file - confirm before expanding this macro.
.macro sqr16_impl xe
    .local arg
    .local result
    .local inter
    .local arg_pos
    arg = FR0    ; 16-bit arg
    result = FR2 ; 32-bit result
    inter = temp2

    ; hl * hl
    ; (h*256 + l) * (h*256 + l)
    ; h*256*(h*256 + l) + l*(h*256 + l)
    ; h*h*256*256 + h*l*256 + h*l*256 + l*l

    ; l * l -> result bytes 0..1, upper word cleared
    sqr8 result, arg
    lda #0
    sta result + 2
    sta result + 3

    ; h * l is needed twice, so compute it once and add it twice
    imul8 inter, arg + 1, arg, xe
    add16 result + 1, result + 1, inter
    add_carry result + 3
    add16 result + 1, result + 1, inter
    add_carry result + 3

    ; h * h, added at byte offset 2
    ; (sqr8 takes only dest and arg; the extra operands that used to be
    ; passed here made the macro impossible to expand)
    sqr8 inter, arg + 1
    add16 result + 2, result + 2, inter

    ; In case of negative inputs, adjust high word
    ; https://stackoverflow.com/a/28827013
    ; (both factors are the same value, so the correction is applied twice)
    lda arg + 1
    bpl arg_pos
    sub16 result + 2, result + 2, arg
    sub16 result + 2, result + 2, arg
arg_pos:

    rts ; 6 cyc
.endmacro

; Signed 16 x 16 -> 32-bit multiply using only base-RAM tables.
; input: FR0, FR1; output: FR2 (32-bit).
; imul8xe_init may overwrite the first three bytes of this routine with
; 'jmp imul16xe_func' when extended RAM is detected.
.proc imul16_func
    imul16_impl 0
.endproc

; Same contract as imul16_func, using the extended-RAM product table.
; Leaves PORTB switched to whichever bank was read last.
.proc imul16xe_func
    imul16_impl 1
.endproc

; Signed 16-bit square using only base-RAM tables.
; input: FR0 (16-bit); output: FR2 (32-bit) = FR0 * FR0.
; Clobbers A, X, FR1, temp2.
;
; Callers (sqr16_round) load only FR0, but the shared multiply body reads
; both FR0 and FR1, so the operand is duplicated into FR1 first. Without
; this the "square" was FR0 times whatever the previous multiply left in FR1.
;
; imul8xe_init may overwrite the first three bytes of this routine with
; 'jmp sqr16xe_func' when extended RAM is detected; the XE variant does its
; own operand copy, so the patched entry point stays correct.
.proc sqr16_func
    copy16 FR1, FR0
    imul16_impl 0
.endproc

; Same contract as sqr16_func, using the extended-RAM product table.
; Additionally clobbers Y and leaves PORTB switched to the last bank read.
.proc sqr16xe_func
    copy16 FR1, FR0
    imul16_impl 1
.endproc

; Round top 16 bits of 32-bit fixed-point number in-place
;
; arg .. arg + 1 is the low (fraction) word, arg + 2 .. arg + 3 the high
; word that gets rounded. Clobbers A and flags.
;
; low word > $8000: round up
;          = $8000: round up   if positive
;                   round down if negative
;          < $8000: round down
;
; The comparisons are unsigned. (Testing the low byte with bpl/bmi, as this
; macro used to, rounded $8080..$80ff down even though they are > $8000.)
.macro round16 arg
    .local increment
    .local next

    lda arg + 1
    cmp #$80
    bcc next         ; high byte of fraction < $80: below one half
    bne increment    ; high byte of fraction > $80: above one half

    ; high byte is exactly $80: any nonzero low byte puts us above one half
    lda arg
    bne increment

    ; exactly one half: round up only when the value is non-negative
    lda arg + 3
    bmi next

increment:       ; 5-10 cyc
    inc arg + 2  ; 5 cyc
    bne next     ; 2 cyc
    inc arg + 3  ; 5 cyc

next:

.endmacro

; Iterate z = z^2 + c for one point.
;
; input:
; cx: position scaled to 4.12 fixed point - -8..+7.9
; cy: position scaled to 4.12
; z_buffer_active: nonzero enables the cycle-detection buffer
;
; output:
; iter: iteration count at escape or 0
;       (0 = iteration counter wrapped, or a repeated z value was found)
; z_buffer_active: set to 1 when iter == 0, else 0, for the next call
;
; Clobbers A, X, Y, zx..dist, FR0..FR2, temp2 and whatever the multiply
; routines clobber.
.proc mandelbrot
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0
    ; dist = 0
    ; iter = 0
    ; (one loop clears every byte from zx up to and including iter)
    lda #00
    ldx #(iter - zx + 1)
initloop:
    sta zx - 1,x
    dex
    bne initloop
    sta z_buffer_start
    sta z_buffer_end

loop:
    ; iter++ & max-iters break
    ; (iter is u8: wrapping to 0 means 255 iterations without escaping)
    inc iter
    bne keep_going
    jmp exit_path
keep_going:

    ; Leave via exit_path unless -max < arg < max, where arg is a signed
    ; fixed4.12 word. Only the high byte is compared, except for the exact
    ; -max case which also checks the low byte.
    .macro quick_exit arg, max
        .local positive
        .local negative
        .local nope_out
        .local first_equal
        .local all_done

        ; check sign bit
        lda arg + 1
        bmi negative

    positive:
        cmp #((max) << 4)
        bmi all_done ; 'less than'
        jmp exit_path

    negative:
        cmp #(256 - ((max) << 4))
        beq first_equal ; 'equal' on first byte
        bpl all_done    ; 'greater than'

    nope_out:
        jmp exit_path
    
    first_equal:
        lda arg
        beq nope_out  ; 2nd byte 0 shows it's really 'equal'

    all_done:
    .endmacro

    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2  - zy_2  + cx
    sub16 zx, zx_2, zy_2
    add16 zx, zx, cx
    quick_exit zx, 2

    ; zy = zx_zy + zx_zy + cy
    add16 zy, zx_zy, zx_zy
    add16 zy, zy, cy
    quick_exit zy, 2

    ; zx_2 = zx * zx
    sqr16_round zx_2, zx, 4

    ; zy_2 = zy * zy
    sqr16_round zy_2, zy, 4

    ; zx_zy = zx * zy
    imul16_round zx_zy, zx, zy, 4

    ; dist = zx_2 + zy_2
    add16 dist, zx_2, zy_2
    quick_exit dist, 4

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
    lda z_buffer_active
    beq skip_z_buffer

    ldx z_buffer_start
    cpx z_buffer_end
    beq z_nothing_to_read

z_buffer_loop:
    ; compare one stored byte with arg, advance X, count matches in Y
    .macro z_compare arg
        .local compare_no_match
        lda z_buffer,x
        inx
        cmp arg
        bne compare_no_match
        iny
    compare_no_match:
    .endmacro
    ; wrap the byte index X back to 0 at the end of the buffer
    ; (unsigned compare: X is an index, not a signed quantity)
    .macro z_advance
        .local skip_reset_x
        cpx #(z_buffer_len * 4)
        bcc skip_reset_x
        ldx #0
    skip_reset_x:
    .endmacro
    ; append one byte at index X and advance X
    .macro z_store arg
        lda arg
        sta z_buffer,x
        inx
    .endmacro

    ; Compare the previously stored z values
    ldy #0
    z_compare zx
    z_compare zx + 1
    z_compare zy
    z_compare zy + 1

    ; all four bytes equal: z repeated, so the orbit is cyclic
    cpy #4
    bne z_no_matches
    jmp z_exit

z_no_matches:
    z_advance

    cpx z_buffer_end
    bne z_buffer_loop

z_nothing_to_read:

    ; Store and expand
    ; (X == z_buffer_end on both paths into here)
    z_store zx
    z_store zx + 1
    z_store zy
    z_store zy + 1
    z_advance
    stx z_buffer_end

    ; Increment the start roller if necessary (limit size)
    ; iter is an unsigned byte, so this must be an unsigned compare:
    ; with bmi the test wrongly failed again once iter reached 192.
    lda iter
    cmp #(z_buffer_len * 4)
    bcc skip_inc_start
    lda z_buffer_start
    clc
    adc #4
    tax
    z_advance
    stx z_buffer_start
skip_inc_start:

skip_z_buffer:

    jmp loop

z_exit:
    ; a cycle counts as "never escapes"
    lda #0
    sta iter

exit_path:
    ; z_buffer_active = (iter == 0) ? 1 : 0
    ldx #0
    lda iter
    bne next
    inx
next:
    stx z_buffer_active
    rts

.endproc

; Shift the 16-bit value at dest left by (8 - zoom) bits, in place.
; zoom is read from the zero-page variable, not passed in.
; Assumes zoom <= 8: X counts up from zoom until it equals 8, so a larger
; value would keep shifting until X wraps around.
.macro scale_zoom dest
    ; clobbers X, flags
    .local cont
    .local enough

    ; cx = (sx << (8 - zoom))
    ldx zoom
cont:
    cpx #8
    beq enough
    shl16 dest
    inx
    jmp cont
enough:
.endmacro

; Convert a screen coordinate to a fixed4.12 plane coordinate:
;   dest = ((src << (8 - zoom)) * aspect), rounded back to fixed4.12.
; src is an i16 pixel coordinate, aspect a fixed4.12 word in memory.
; NOTE: the 'zoom' parameter is not used by the body; scale_zoom reads the
; zero-page zoom variable directly.
.macro zoom_factor dest, src, zoom, aspect
    ; clobbers A, X, flags, etc
    copy16 dest, src
    scale_zoom dest

    ; cy = cy * (3 / 4)
    ; cx = cx * (5 / 4)
    imul16_round dest, dest, aspect, 4
.endmacro

; Plot one pixel.
;
; input:  sx, sy = signed screen coordinates (origin at screen centre)
;         iter   = iteration count, mapped to a colour through color_map
; output: the 2-bit pixel at (sx, sy) in the framebuffer is replaced
; clobbers: A, X, Y, temp, pixel_ptr, pixel_color, pixel_mask,
;           pixel_shift, pixel_offset
.proc pset
    ; --- line address -------------------------------------------------
    ; Rows with negative sy live in the top buffer, addressed backwards
    ; from the row just past its end; the rest start at the bottom buffer.
    lda sy
    bmi upper_half

    lda #.lobyte(framebuffer_bottom)
    sta pixel_ptr
    lda #.hibyte(framebuffer_bottom)
    sta pixel_ptr + 1
    jmp add_rows

upper_half:
    lda #.lobyte(framebuffer_top + stride * half_height)
    sta pixel_ptr
    lda #.hibyte(framebuffer_top + stride * half_height)
    sta pixel_ptr + 1

add_rows:
    ; pixel_ptr += sy * 40, built as sy * 8 + sy * 32
    copy16 temp, sy
    .repeat 3
        shl16 temp
    .endrepeat
    add16 pixel_ptr, pixel_ptr, temp    ; + sy * 8
    .repeat 2
        shl16 temp
    .endrepeat
    add16 pixel_ptr, pixel_ptr, temp    ; + sy * 32

    ; --- colour and mask for the rightmost pixel slot -----------------
    ldx iter
    lda color_map,x
    sta pixel_color
    lda #(255 - 3)
    sta pixel_mask

    ; --- column: 0-based x, split into byte offset and slot -----------
    clc
    lda sx
    adc #half_width
    sta temp

    and #3
    sta pixel_shift

    ; Slot 0 is the leftmost pixel (top two bits), so move colour and
    ; mask left by (3 - slot) pixel widths; the mask shifts in ones.
    lda #3
    sec
    sbc pixel_shift
    tax
    beq in_position
shift_pixel:
    asl pixel_color
    asl pixel_color
    sec
    rol pixel_mask
    sec
    rol pixel_mask
    dex
    bne shift_pixel
in_position:

    ; byte offset within the line = x / 4
    lda temp
    lsr a
    lsr a
    sta pixel_offset
    tay

    ; --- read-modify-write the framebuffer byte -----------------------
    lda (pixel_ptr),y
    and pixel_mask
    ora pixel_color
    sta (pixel_ptr),y

    rts
.endproc

; Draw 'len' characters from the string pointed to by the zero-page
; pointer 'strptr' into the text row, starting at column 'col'.
; Each character is translated through char_map.
;
; Bit 7 of every character is stripped before the lookup: char_map only
; has 128 entries, and FASC output marks its final character by setting
; the high bit, which would otherwise index past the end of the table.
;
; 'col' and 'len' are assembly-time constants.
.macro draw_text_indirect col, len, strptr
    ; clobbers A, X, Y
    .local loop
    .local done
    ldx #0
loop:
    cpx #len
    beq done
    txa
    tay
    lda (strptr),y
    and #$7f        ; drop the end-of-string marker bit
    tay
    lda char_map,y
    sta textbuffer + col,x
    inx
    jmp loop
done:
.endmacro

; Draw 'len' characters from the string at absolute address 'cstr' into
; the text row, starting at column 'col'. Each character is translated
; through char_map, which has 128 entries, so characters must be < 128.
; 'col' and 'len' are assembly-time constants.
.macro draw_text col, len, cstr
    ; clobbers A, X, Y
    .local loop
    .local done
    ldx #0
loop:
    cpx #len
    beq done
    ldy cstr,x
    lda char_map,y
    sta textbuffer + col,x
    inx
    jmp loop
done:
.endmacro

; Vertical-blank handler (installed by start through SETVBV).
; Advances the colour cycle, counts the frame for the speed readout,
; refreshes the colour registers, and leaves through the OS exit vector.
.proc vblank_handler
    inc palette_offset      ; step the colour cycle
    inc count_frames        ; one more frame elapsed
    jsr update_palette
    jmp XITVBV
.endproc

; Write the colour shadow registers from the palette table.
;
; COLOR4 gets palette[0] unchanged. COLOR0..COLOR2 get palette[1..3] plus
; the top nibble of palette_offset, so the three drawing colours rotate
; together as palette_offset advances.
;
; Clobbers A and flags only.
.proc update_palette
    ; COLOR0 = (palette_offset & $f0) + palette[1]
    lda palette_offset
    and #$f0
    clc
    adc palette + 1
    sta COLOR0

    ; COLOR1 = (palette_offset & $f0) + palette[2]
    lda palette_offset
    and #$f0
    clc
    adc palette + 2
    sta COLOR1

    ; fixed colour, not cycled
    lda palette
    sta COLOR4

    ; COLOR2 = (palette_offset & $f0) + palette[3]
    lda palette_offset
    and #$f0
    clc
    adc palette + 3
    sta COLOR2

    rts
.endproc

; Placeholder for the speed readout; the equivalent work is currently done
; inline in start (update_status).
; NOTE: the body is empty - there is no rts - so calling this label would
; run straight into whatever code follows it. Nothing in the visible part
; of this file calls it.
.proc update_speed
    ; convert frames (u16) to fp
    ; add to frames_total
    ; convert pixels (u16) to fp
    ; add to pixels_total
    ; (frames_total * 16.66666667) / pixels_total
    ; convert to ATASCII
    ; draw text
.endproc

; Poll the keyboard and apply zoom / pan keys to the viewport.
;
; returns: A = 255 if zoom, ox or oy changed, A = 0 otherwise
;          (Z flag matches A, so callers can branch with beq/bne)
; clobbers: everything, including temp
.proc keycheck
    ; $ff in CH means no key is waiting
    lda CH
    cmp #$ff
    beq unchanged

    ; consume the key so it is not seen again, and keep its code in Y
    ldx #$ff
    stx CH
    tay

    ; zoom keys need no step size
    cpy #KEY_MINUS
    beq zoom_out
    cpy #KEY_PLUS
    beq zoom_in

    ; pan step: temp = $0010 << (8 - zoom)
    lda #$00
    sta temp + 1
    lda #$10
    sta temp
    scale_zoom temp

    cpy #KEY_RIGHT
    beq pan_right
    cpy #KEY_LEFT
    beq pan_left
    cpy #KEY_DOWN
    beq pan_down
    cpy #KEY_UP
    beq pan_up

    ; any other key is ignored
unchanged:
    lda #0
    rts

zoom_in:
    ; already at the deepest level (8)? then nothing changes
    lda zoom
    cmp #8
    bpl unchanged
    inc zoom
    jmp changed

zoom_out:
    ; already at level 0? then nothing changes
    lda zoom
    cmp #1
    bmi unchanged
    dec zoom
    jmp changed

pan_up:
    sub16 oy, oy, temp
    jmp changed

pan_down:
    add16 oy, oy, temp
    jmp changed

pan_left:
    sub16 ox, ox, temp
    jmp changed

pan_right:
    add16 ox, ox, temp

changed:
    lda #255
    rts

.endproc

; Fill every page from framebuffer_top up to (not including) the page of
; display_list with zeros. That covers both framebuffer halves and the
; text row.
;
; clobbers: A, Y, temp
.proc clear_screen
    ; temp -> first page to clear
    lda #.hibyte(framebuffer_top)
    sta temp + 1
    lda #.lobyte(framebuffer_top)
    sta temp

next_page:
    ; Y walks the 256 bytes of the page; A holds the fill value
    ldy #0
    tya
fill_byte:
    sta (temp),y
    iny
    bne fill_byte

    ; move on until the display list page is reached
    inc temp + 1
    lda temp + 1
    cmp #.hibyte(display_list)
    bne next_page

    rts
.endproc

; Draw the fixed parts of the status row: the program name at the left
; edge and the " RUN" marker right-aligned on the 40-column line.
; Clobbers A, X, Y (through draw_text).
.proc status_bar
    ; Status bar
    draw_text 0, str_self_len, str_self
    draw_text 40 - str_run_len, str_run_len, str_run

    rts
.endproc

; Program entry point.
;
; Sets up the multiply tables, viewport, display list, palette and vblank
; handler, then renders the fractal in max_fill_level progressively finer
; passes. A zoom/pan key restarts the render; once it has finished, the
; routine idles until such a key is pressed. Never returns.
.proc start

    jsr imul8xe_init

    ; initialize viewport
    ldx #0 ; overview
    ;ldx #1 ; closeup
    lda viewport_zoom,x
    sta zoom

    ; X *= 2 to index the word-sized ox/oy tables
    txa
    asl a
    tax
    lda viewport_ox,x
    sta ox
    lda viewport_oy,x
    sta oy
    inx
    lda viewport_ox,x
    sta ox + 1
    lda viewport_oy,x
    sta oy + 1

    ; count_frames = 0; count_pixels = 0
    lda #0
    sta count_frames
    sta count_pixels

    ; total_ms = 0.0; total_pixels = 0.0
    ldx #total_ms
    jsr ZF1
    ldx #total_pixels
    jsr ZF1

    ; Disable display DMA
    lda #0
    sta DMACTL

    jsr clear_screen

    ; Copy the display list into properly aligned memory
    ; Can't cross 1024-byte boundaries :D
    ldx #0
copy_byte_loop:
    lda display_list_start,x
    sta display_list,x
    inx
    cpx #display_list_len
    bne copy_byte_loop

    ; Set up the display list
    lda #.lobyte(display_list)
    sta DLISTL ; actual register
    sta SDLSTL ; shadow register the OS will copy in
    lda #.hibyte(display_list)
    sta DLISTH ; actual register
    sta SDLSTH ; shadow register the OS will copy in

    ; Re-enable display DMA
    lda #$22
    sta DMACTL

    ; Initialize the palette
    lda #0
    sta palette_offset
    jsr update_palette

    ; install the vblank handler
    lda #7 ; deferred
    ldx #.hibyte(vblank_handler)
    ldy #.lobyte(vblank_handler)
    jsr SETVBV

main_loop:
    jsr clear_screen
    jsr status_bar

    lda #0
    sta fill_level

fill_loop:

    ; sy = -92 .. 91
    lda #(256-half_height)
    sta sy
    lda #(256-1)
    sta sy + 1

loop_sy:
    ; sx = -80 .. 79
    lda #(256-half_width)
    sta sx
    lda #(256-1)
    sta sx + 1

loop_sx:
    ; check the fill mask
    ; A pixel is skipped if an earlier (coarser) pass already drew it, or
    ; if it is not on the grid of the current pass.
    ldy #0

loop_skip_level:
    cpy fill_level
    beq current_level

    ; on the grid of earlier pass Y in both x and y? already drawn
    lda fill_masks,y
    and sx
    bne not_skipped_mask1

    lda fill_masks,y
    and sy
    beq skipped_mask

not_skipped_mask1:
    iny
    jmp loop_skip_level

current_level:
    ; must be on the current pass's grid in both x and y
    lda fill_masks,y
    and sx
    bne skipped_mask

    lda fill_masks,y
    and sy
    beq not_skipped_mask

skipped_mask:
    jmp skipped

not_skipped_mask:

    ; run the fractal!
    zoom_factor cx, sx, zoom, aspect_x
    add16 cx, cx, ox
    zoom_factor cy, sy, zoom, aspect_y
    add16 cy, cy, oy
    jsr mandelbrot
    jsr pset

    jsr keycheck
    beq no_key
    ; @fixme clear the pixel stats
    jmp main_loop

no_key:
    ; check if we should update the counters
    ;
    ; Both counters are unsigned bytes, so the tests below use the carry
    ; flag (bcs = '>=', bcc = '<'). The signed bmi used previously did not
    ; implement these comparisons: for the pixel test it fired once the
    ; count reached 32 rather than 'width'.
    ;
    ; count_pixels >= width? update!
    inc count_pixels
    lda count_pixels
    cmp #width
    bcs update_status

    ; count_frames >= 120? update!
    lda count_frames
    cmp #120 ; >= 2 seconds
    bcc skipped

update_status:
    ; FR0 = (float)count_pixels & clear count_pixels
    lda count_pixels
    sta FR0
    lda #0
    sta FR0 + 1
    sta count_pixels
    jsr IFP

    ; FR1 = total_pixels
    ldx #.lobyte(total_pixels)
    ldy #.hibyte(total_pixels)
    jsr FLD1R

    ; FR0 += FR1
    jsr FADD

    ; total_pixels = FR0
    ldx #.lobyte(total_pixels)
    ldy #.hibyte(total_pixels)
    jsr FST0R


    ; FR0 = (float)count_frames & clear count_frames
    ; warning: this should really disable interrupts @TODO
    lda count_frames
    sta FR0
    lda #0
    sta FR0 + 1
    sta count_frames
    jsr IFP

    ; FR0 *= ms_per_frame
    ldx #.lobyte(ms_per_frame)
    ldy #.hibyte(ms_per_frame)
    jsr FLD1R
    jsr FMUL

    ; FR0 += total_ms
    ldx #total_ms
    ldy #0
    jsr FLD1R
    jsr FADD

    ; total_ms = FR0
    ldx #total_ms
    ldy #0
    jsr FST0R

    ; FR0 /= total_pixels
    ldx #total_pixels
    ldy #0
    jsr FLD1R
    jsr FDIV

    ; convert to ASCII in INBUFF
    jsr FASC

    ; print the first 6 digits
    draw_text_indirect speed_start, speed_precision, INBUFF
    draw_text speed_start + speed_precision, str_speed_len, str_speed

skipped:

    ; sx++
    clc
    lda sx
    adc #1
    sta sx
    lda sx + 1
    adc #0
    sta sx + 1

    ; end of row when the low byte reaches half_width
    lda sx
    cmp #half_width
    beq loop_sx_done
    jmp loop_sx

loop_sx_done:

    ; sy++
    clc
    lda sy
    adc #1
    sta sy
    lda sy + 1
    adc #0
    sta sy + 1

    ; end of pass when the low byte reaches half_height
    lda sy
    cmp #half_height
    beq loop_sy_done
    jmp loop_sy

loop_sy_done:

fill_loop_done:
    inc fill_level
    lda fill_level
    cmp #max_fill_level
    beq loop
    jmp fill_loop

loop:
    ; finished
    ; keep showing DONE and wait for a key that changes the view
    draw_text 40 - str_done_len, str_done_len, str_done
    jsr keycheck
    beq loop
    jmp main_loop

.endproc