; Zero-page working storage.
; Each address is derived from the previous variable's address plus that
; variable's size, so the layout stays contiguous if a size changes.
sx    = $80         ; 1 byte:  screen pixel x
sy    = sx + 1      ; 1 byte:  screen pixel y
cx    = sy + 1      ; 2 bytes: fixed point
cy    = cx + 2      ; 2 bytes: fixed point
zx    = cy + 2      ; 2 bytes: fixed point
zy    = zx + 2      ; 2 bytes: fixed point
zx_2  = zy + 2      ; 4 bytes: fixed point
zy_2  = zx_2 + 4    ; 4 bytes: fixed point
zx_zy = zy_2 + 4    ; 4 bytes: fixed point
dist  = zx_zy + 4   ; 4 bytes: fixed point
iter  = dist + 4    ; 1 byte:  iteration count

temp  = $a0         ; debug scratch area

; Floating-point registers in zero page, spaced 6 bytes apart.
; Used here as argument/result scratch for the integer multiply.
FR0 = $d4
FRE = FR0 + 6
FR1 = FRE + 6
FR2 = FR1 + 6

.code

.export start

; Multi-byte add: dest = arg1 + arg2, processed from the lowest address up
; so the carry ripples into each following byte.
; Clobbers A and the arithmetic flags.
; Cost: 2 + 9 * bytes cycles (original estimate)
.macro add bytes, dest, arg1, arg2
    clc                     ; 2 cyc - nothing carries into the first byte
    .repeat bytes, idx      ; 9 cyc per byte
        lda arg1 + idx
        adc arg2 + idx
        sta dest + idx
    .endrepeat
.endmacro

; 16-bit add: sum = lhs + rhs (2-byte instance of the add macro)
.macro add16 sum, lhs, rhs
    add 2, sum, lhs, rhs
.endmacro

; 32-bit add: dest = arg1 + arg2 (4-byte instance of the add macro).
; Clobbers A and the arithmetic flags.
.macro add32 dest, arg1, arg2
    ; All four bytes must be summed, and both source operands used;
    ; dest may alias either source since each byte is read before written.
    add 4, dest, arg1, arg2
.endmacro

; Multi-byte subtract: dest = arg1 - arg2, processed from the lowest address
; up so the borrow ripples into each following byte.
; Clobbers A and the arithmetic flags.
; Cost: 2 + 9 * bytes cycles (original estimate)
.macro sub bytes, dest, arg1, arg2
    sec                     ; 2 cyc - carry set means "no borrow" for sbc
    .repeat bytes, idx      ; 9 cyc per byte
        lda arg1 + idx
        sbc arg2 + idx
        sta dest + idx
    .endrepeat
.endmacro

; 16-bit subtract: diff = lhs - rhs (2-byte instance of the sub macro)
.macro sub16 diff, lhs, rhs
    sub 2, diff, lhs, rhs
.endmacro

; 32-bit subtract: diff = lhs - rhs (4-byte instance of the sub macro)
.macro sub32 diff, lhs, rhs
    sub 4, diff, lhs, rhs
.endmacro

; Multi-byte shift left by one bit, in place.
; bytes: operand width in bytes; arg: address of the lowest byte.
; The bit shifted out of each byte is carried into the next higher byte;
; on exit the carry flag holds the bit shifted out of the top byte.
.macro shl bytes, arg
    asl arg                 ; lowest byte: 0 shifted in, bit 7 -> carry
    .repeat bytes-1, idx
        rol arg + 1 + idx   ; each higher byte takes the carry from below
    .endrepeat
.endmacro

; 2-byte instance of the shl macro
.macro shl16 value
    shl 2, value
.endmacro

; 3-byte instance of the shl macro
.macro shl24 value
    shl 3, value
.endmacro

; 4-byte instance of the shl macro
.macro shl32 value
    shl 4, value
.endmacro

; Multi-byte copy: dest = arg, one byte at a time through A.
; Clobbers A and the N/Z flags.
; Cost: 6 * bytes cycles (original estimate)
.macro copy bytes, dest, arg
    .repeat bytes, idx
        lda arg + idx       ; 3 cyc
        sta dest + idx      ; 3 cyc
    .endrepeat
.endmacro

; 2-byte instance of the copy macro: target = source
.macro copy16 target, source
    copy 2, target, source
.endmacro

; 4-byte instance of the copy macro: target = source
.macro copy32 target, source
    copy 4, target, source
.endmacro

; Multi-byte two's-complement negate in place: arg = 0 - arg.
; Clobbers A and the arithmetic flags.
; Cost: 2 + 8 * bytes cycles (original estimate)
.macro neg bytes, arg
    sec                     ; 2 cyc - start with no borrow
    .repeat bytes, idx      ; 8 cyc per byte
        lda #0              ; 2 cyc
        sbc arg + idx       ; 3 cyc
        sta arg + idx       ; 3 cyc
    .endrepeat
.endmacro

; 16-bit in-place negate (2-byte instance of the neg macro)
; 18 cycles
.macro neg16 value
    neg 2, value
.endmacro

; 32-bit in-place negate (4-byte instance of the neg macro)
; 34 cycles
.macro neg32 value
    neg 4, value
.endmacro

; One step of the shift-and-add multiply used by imul16_func.
;
; Tests bit `bitnum` of the 16-bit value at arg1. If it is set, the 16-bit
; value at arg2 is added into the top two bytes of the 32-bit `result`.
; Either way the accumulated result is then shifted right by one bit, so
; that after 16 steps (bitnum = 0..15) the full product sits in `result`.
;
; Clobbers A and the arithmetic flags; does not touch X or Y.
;
; Cycle estimates from the original author (not re-verified here):
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next

    ; The add is 16 bits wide and arg1/arg2 are treated as unsigned:
    ; negative signed inputs must be made positive before the first step.

    ; Isolate the multiplier bit for this step. Bits 0-7 live in the low
    ; byte of arg1 and bits 8-15 in the high byte; the choice is made at
    ; assembly time, so it costs the same 5 cycles either way.
    .if bitnum < 8
        lda arg1                 ; 3 cyc
        and #(1 << bitnum)       ; 2 cyc
    .else
        lda arg1 + 1             ; 3 cyc
        and #(1 << (bitnum - 8)) ; 2 cyc
    .endif
    bne one ; 2 cyc

zero: ; multiplier bit clear: nothing to add, just start the right shift
    lsr result + 3 ; 5 cyc - shifts a 0 into the top bit
    jmp next       ; 3 cyc

one: ; multiplier bit set: add arg2 into the top 16 bits, then shift
    ; 16-bit add on the top bits
    clc            ; 2 cyc
    lda result + 2 ; 3 cyc
    adc arg2       ; 3 cyc
    sta result + 2 ; 3 cyc
    lda result + 3 ; 3 cyc
    adc arg2 + 1   ; 3 cyc
    ; The carry out of the add is the 17th bit of the sum; rotating it
    ; into A here performs the top byte's share of the right shift before
    ; the byte is stored.
    ror a          ; 2 cyc - get a jump on the shift
    sta result + 3 ; 3 cyc
next:
    ; Continue the right shift through the lower bytes; the carry holds
    ; the bit that fell out of result + 3 on both paths.
    ror result + 2 ; 5 cyc
    ror result + 1 ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ; (during the first 8 steps only stale bits from result + 1 would
        ; arrive here, and the 8 shifts of the later steps push any stale
        ; contents of this byte back out)
        ror result ; 5 cyc
    .endif


.endmacro

; Make the 16-bit value at arg non-negative, counting sign flips in X.
; If the value is negative it is negated in place and X is incremented;
; otherwise memory and X are left alone. Clobbers A and flags.
; 5 to 25 cycles
.macro check_sign arg
    .local nonneg
    lda arg + 1   ; 3 cyc - the high byte carries the sign bit
    bpl nonneg    ; 2 cyc - sign clear: nothing to do
    neg16 arg     ; 18 cyc - flip to positive
    inx           ; 2 cyc - remember that one input was negative
nonneg:
.endmacro

; Signed multiply wrapper: product (32-bit) = lhs * rhs (16-bit each).
; Stages the operands in FR0 and FR1, calls imul16_func, and copies the
; result out of FR2. The operands are copied, so lhs and rhs may be the
; same variable and are not modified.
; 518 - 828 cyc
.macro imul16 product, lhs, rhs
    copy16 FR0, lhs         ; 12 cyc
    copy16 FR1, rhs         ; 12 cyc
    jsr imul16_func         ; 470-780
    copy32 product, FR2     ; 24 cyc
.endmacro

; Signed 16 x 16 -> 32 bit multiply.
;   in:  FR0, FR1 = 16-bit operands (negative operands are negated in place)
;   out: FR2      = 32-bit product
;   clobbers A, X and flags
; min 470 cycles
; max 780 cycles
.proc imul16_func
    multiplier   = FR0  ; 16-bit, its bits select the partial sums
    multiplicand = FR1  ; 16-bit, the value added for each set bit
    product      = FR2  ; 32-bit accumulator / result

    ; X counts how many of the two inputs were negative.
    ldx #0                  ; 2 cyc
    check_sign multiplier   ; 5 to 25 cyc
    check_sign multiplicand ; 5 to 25 cyc

    ; Only the top half of the accumulator needs clearing: the low half is
    ; filled by bits shifted down during the 16 multiply steps.
    lda #0                  ; 2 cyc
    sta product + 2         ; 3 cyc
    sta product + 3         ; 3 cyc

    ; Fully unrolled, one bitmul16 step per multiplier bit: faster than a
    ; counted loop at the cost of code size.
    ; 440 to 696 cycles
    .repeat 16, bitpos
        bitmul16 multiplier, multiplicand, product, bitpos
    .endrepeat

    ; Exactly one negative input means the product is negative.
    cpx #1                  ; 2 cyc
    bne done                ; 2 cyc
    neg32 product           ; 34 cyc
done:
    rts                     ; 6 cyc
.endproc

.macro round16 arg
    ; Round the top 16 bits (arg + 2, arg + 3) of a 32-bit two's-complement
    ; fixed-point number in place, using bit 7 of arg + 1 as the "half" bit.
    ;
    ; In two's complement the low word is a non-negative fraction added to
    ; the high word for negative values as well, so rounding to nearest
    ; always means adding one to the high word when the half bit is set.
    ; Exact ties therefore round toward +infinity.
    ;
    ; Clobbers A and the N/Z flags.
    ;
    ; no round        - 5 cycles (approx.)
    ; round, no carry - 12 cycles (approx.)
    ; round, carry    - 17 cycles (approx.)
    .local done

    lda arg + 1  ; 3 cyc - half bit lands in the N flag
    bpl done     ; 2 cyc - fraction below one half: truncation is nearest

    inc arg + 2  ; 5 cyc - add one to the low byte of the high word
    bne done     ; 2 cyc - did not wrap to 0, so no carry to propagate
    inc arg + 3  ; 5 cyc - low byte wrapped: carry into the high byte

done:

.endmacro


.proc mandelbrot
    ; Iterate z = z^2 + c for one point until it escapes or the iteration
    ; counter wraps.
    ;
    ; input:
    ; cx: position scaled to 4.12 fixed point - -8..+7.9
    ; cy: position scaled to 4.12
    ;
    ; output:
    ; iter: iteration count at escape, or 0 if the 8-bit counter wrapped
    ;       without the point escaping
    ;
    ; clobbers: A, X, flags, FR0/FR1/FR2 (through imul16), and the working
    ; variables zx, zy, zx_2, zy_2, zx_zy, dist

    ; Clear every working variable from zx through iter inclusive:
    ; zx = zy = zx_2 = zy_2 = zx_zy = dist = iter = 0
    ; The index counts down to 0 and the loop exits when it goes negative,
    ; so offset 0 is cleared too; that requires the span to fit in 7 bits.
    .assert (iter - zx) < 128, error, "init loop index must stay below 128"
    lda #0
    ldx #(iter - zx)
initloop:
    sta zx,x
    dex
    bpl initloop

loop:
    ; iter++ & max-iters break = 7 cyc
    inc iter       ; 5 cyc
    bne keep_going ; 2 cyc
    rts            ; counter wrapped to 0: give up, iter = 0
keep_going:

    ; The squares are kept as 32-bit values whose top 16 bits (offset + 2)
    ; hold the rounded 4.12 result from the previous pass.

    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2 - zy_2 + cx
    sub16 zx, zx_2 + 2, zy_2 + 2
    add16 zx, zx, cx

    ; zy = zx_zy + zx_zy + cy
    add16 zy, zx_zy + 2, zx_zy + 2
    add16 zy, zy, cy

    ; 8.24: (-128 .. +127.9)
    ; zx_2 = zx * zx            = 518 - 828 cyc
    imul16 zx_2, zx, zx

    ; zy_2 = zy * zy            = 518 - 828 cyc
    imul16 zy_2, zy, zy

    ; zx_zy = zx * zy           = 518 - 828 cyc
    imul16 zx_zy, zx, zy

    ; dist = zx_2 + zy_2, summed over all four bytes so that the integer
    ; byte (dist + 3) of the 8.24 value is valid = 38 cyc
    add 4, dist, zx_2, zy_2

    ; if dist >= 4 break, else continue iterating.
    ; dist is a sum of squares, so its top byte is compared as unsigned:
    ; a signed test would mistake values of 128 and above for "inside".
    lda dist + 3  ; 3 cyc
    cmp #4        ; 2 cyc
    bcc still_in  ; 2 cyc - carry clear means dist + 3 < 4
    rts
still_in:

    ; Convert each 8.24 product to 4.12 in its top 16 bits: shift left by
    ; four bits, then round. Only bytes 1-3 are shifted; byte 0 would only
    ; feed bits below the rounding bit. The carry is chained from each
    ; byte into the next higher one.

    ; shift and round zx_2 to 4.12
    .repeat 4               ; 60 cyc
        asl zx_2 + 1        ; 5 cyc
        rol zx_2 + 2        ; 5 cyc
        rol zx_2 + 3        ; 5 cyc
    .endrepeat
    round16 zx_2

    ; shift and round zy_2 to 4.12
    .repeat 4               ; 60 cyc
        asl zy_2 + 1        ; 5 cyc
        rol zy_2 + 2        ; 5 cyc
        rol zy_2 + 3        ; 5 cyc
    .endrepeat
    round16 zy_2

    ; shift and round zx_zy to 4.12
    .repeat 4               ; 60 cyc
        asl zx_zy + 1       ; 5 cyc
        rol zx_zy + 2       ; 5 cyc
        rol zx_zy + 3       ; 5 cyc
    .endrepeat
    round16 zx_zy

    ; TODO: if we may be in the lake, look for looping output with a small
    ; buffer as an optimization vs running to max iters
    jmp loop ; 3 cycles

.endproc

.proc start
    ; Debug entry point: runs mandelbrot for one fixed test point over and
    ; over, leaving the iteration count in temp for inspection.

    ; Test point in 4.12 fixed point (value * 4096). The 16-bit variables
    ; are stored low byte first, matching the multi-byte macros.
    test_cx = $f800 ; -0.5
    test_cy = $1000 ; +1.0

looplong:
    ; cx = -0.5
    lda #<test_cx
    sta cx
    lda #>test_cx
    sta cx + 1

    ; cy = 1
    lda #<test_cy
    sta cy
    lda #>test_cy
    sta cy + 1

    jsr mandelbrot

    ; save the completed iter count for debugging
    lda iter
    sta temp

loop:
    ; keep looping over so we can work in the debugger
    jmp looplong
.endproc