; FP registers in zero page
; The five symbols are spaced 6 bytes apart. In the visible part of this file
; they are used as integer scratch by imul16: FR0 and FR1 hold the two 16-bit
; arguments and FR2..FR2+3 receives the 32-bit result.
; NOTE(review): the addresses look like the Atari 8-bit OS floating-point
; work area -- confirm against the target platform.
FR0 = $d4
FRE = $da ; not referenced in the visible part of this file
FR1 = $e0
FR2 = $e6
FRX = $ec ; not referenced in the visible part of this file

.code

.export start

; Sign-extend the 16-bit little-endian value at arg..arg+1 to 32 bits in
; place: arg+2 and arg+3 are filled with $00 when the value is non-negative
; and with $ff when it is negative.
; Clobbers: A, flags (C is left holding the sign bit of the input).
.macro sext16to32 arg
    .local store
    lda arg+1   ; high byte carries the sign in bit 7
    asl a       ; sign -> carry
    lda #$00    ; fill for a non-negative value; lda leaves carry untouched
    bcc store
    lda #$ff    ; carry set: value is negative, fill with ones
store:
    sta arg+2
    sta arg+3
.endmacro

; Copy `bytes` consecutive bytes from arg1 to arg2, lowest address first.
; The expansion is fully unrolled: one lda/sta pair per byte.
; Clobbers: A, N and Z flags.
.macro copy bytes, arg1, arg2
    .repeat bytes, byte     ; honour the requested width (was hard-coded to 2)
        lda arg1+byte
        sta arg2+byte
    .endrepeat
.endmacro

; Convenience wrapper: expands to `copy 2, arg1, arg2`, i.e. requests a
; 2-byte (16-bit) copy from arg1 to arg2.
.macro copy16 arg1, arg2
    copy 2, arg1, arg2
.endmacro

; Convenience wrapper: expands to `copy 4, arg1, arg2`, i.e. requests a
; 4-byte (32-bit) copy from arg1 to arg2.
.macro copy32 arg1, arg2
    copy 4, arg1, arg2
.endmacro

; Two's-complement negate the `bytes`-byte little-endian value at arg, in
; place. Computes 0 - arg one byte at a time, letting the borrow ripple from
; the low byte to the high byte.
; Clobbers: A, flags.
; Cost: 2 + 8 * bytes cycles when arg lives in zero page.
.macro neg bytes, arg
    sec                     ; 2 cyc - no borrow going into the low byte
    .repeat bytes, offset   ; 8 cyc for each byte
        lda #0              ; 2 cyc
        sbc arg + offset    ; 3 cyc
        sta arg + offset    ; 3 cyc
    .endrepeat
.endmacro

; Negate a 16-bit value in place; expands to `neg 2, arg`.
; 18 cycles (2 + 8 * 2) when arg is in zero page.
.macro neg16 arg
    neg 2, arg
.endmacro

; Negate a 32-bit value in place; expands to `neg 4, arg`.
; 34 cycles (2 + 8 * 4) when arg is in zero page.
.macro neg32 arg
    neg 4, arg
.endmacro

; Multi-byte add: arg1 = arg1 + arg2, both `bytes` bytes wide and stored
; little-endian. The carry ripples from the low byte upward and the final
; carry-out is left in C.
; Clobbers: A, flags.
; Cost: 2 + 9 * bytes cycles when both operands live in zero page.
.macro add bytes, arg1, arg2
    clc                     ; 2 cyc - no carry into the low byte
    .repeat bytes, offset   ; 9 cyc for each byte
        lda arg1 + offset   ; 3 cyc
        adc arg2 + offset   ; 3 cyc
        sta arg1 + offset   ; 3 cyc
    .endrepeat
.endmacro

; 16-bit add, arg1 += arg2; expands to `add 2, arg1, arg2`.
; 20 cycles (2 + 9 * 2) when both operands are in zero page.
.macro add16 arg1, arg2
    add 2, arg1, arg2
.endmacro

; 32-bit add, arg1 += arg2; expands to `add 4, arg1, arg2`.
; 38 cycles (2 + 9 * 4) when both operands are in zero page.
.macro add32 arg1, arg2
    add 4, arg1, arg2
.endmacro

; Shift the `bytes`-byte little-endian value at arg left by one bit, in
; place. A zero enters bit 0 of the low byte; the bit shifted out of the
; top byte is left in C.
; Clobbers: flags (A is untouched).
.macro shl bytes, arg
    asl arg                     ; low byte: 0 in, bit 7 -> carry
    .repeat bytes-1, offset
        rol arg + 1 + offset    ; each higher byte picks up the carry
    .endrepeat
.endmacro

; Convenience wrapper: expands to `shl 2, arg` (one-bit left shift of a
; 2-byte value).
.macro shl16 arg
    shl 2, arg
.endmacro

; Convenience wrapper: expands to `shl 3, arg` (one-bit left shift of a
; 3-byte value).
.macro shl24 arg
    shl 3, arg
.endmacro

; Convenience wrapper: expands to `shl 4, arg` (one-bit left shift of a
; 4-byte value).
.macro shl32 arg
    shl 4, arg
.endmacro

; Logical (unsigned) shift of the `bytes`-byte little-endian value at arg
; right by one bit, in place. A zero enters bit 7 of the top byte; the bit
; shifted out of the low byte is left in C.
; A right shift has to start at the most significant byte so that each
; byte's bit 0 is carried down into bit 7 of the byte below it.
; Clobbers: flags (A is untouched).
.macro shr bytes, arg
    lsr arg + bytes - 1                 ; top byte: 0 in, bit 0 -> carry
    .repeat bytes-1, byte
        ror arg + bytes - 2 - byte      ; walk down towards the low byte
    .endrepeat
.endmacro

; Convenience wrapper: expands to `shr 2, arg` (one-bit right shift of a
; 2-byte value).
.macro shr16 arg
    shr 2, arg
.endmacro

; Convenience wrapper: expands to `shr 3, arg` (one-bit right shift of a
; 3-byte value).
.macro shr24 arg
    shr 3, arg
.endmacro

; Convenience wrapper: expands to `shr 4, arg` (one-bit right shift of a
; 4-byte value).
.macro shr32 arg
    shr 4, arg
.endmacro

; One step of an unrolled shift-and-add multiply.
;
; If bit `bitnum` of the 16-bit value at arg1 is set, the 16-bit value at
; arg2 is added to the top word of the 32-bit accumulator (result+2,
; result+3). The accumulator, including the carry out of that add, is then
; shifted right by one bit. Expanding the macro for bitnum = 0..15 in order
; leaves arg1 * arg2 in result..result+3.
;
; Only result+2 and result+3 need to be zeroed beforehand: whatever was in
; result and result+1 is shifted out and discarded over the 16 steps.
;
; Clobbers: A, flags. arg1 and arg2 are only read.
;
; Cycles, assuming zero-page operands and no page-crossing branch:
;   bitnum < 8:  25 (bit clear) to 40 (bit set)
;   bitnum >= 8: 30 (bit clear) to 45 (bit set)
.macro bitmul16 arg1, arg2, result, bitnum
    .local one
    .local next

    ; does 16-bit adds
    ; arg1 must be 0 or positive
    ; arg2 must be 0 or positive

    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                 ; 3 cyc
        and #(1 << bitnum)       ; 2 cyc
    .else
        lda arg1 + 1             ; 3 cyc
        and #(1 << (bitnum - 8)) ; 2 cyc
    .endif
    bne one        ; 2 cyc, 3 when taken

    ; Bit is clear: nothing to add, so just start the 32-bit right shift
    ; at the top byte with a zero coming in.
    lsr result + 3 ; 5 cyc
    jmp next       ; 3 cyc

one:
    ; Bit is set: 16-bit add on the top bits.
    clc            ; 2 cyc
    lda result + 2 ; 3 cyc
    adc arg2       ; 3 cyc
    sta result + 2 ; 3 cyc
    lda result + 3 ; 3 cyc
    adc arg2 + 1   ; 3 cyc
    ; Shift the top byte while it is still in A: the carry out of the add
    ; becomes bit 7 and bit 0 drops into carry for the bytes below.
    ror a          ; 2 cyc
    sta result + 3 ; 3 cyc - sta leaves the carry intact

next:
    ; Both paths arrive with result+3 already shifted exactly once and its
    ; old bit 0 in carry; finish shifting the lower bytes.
    ror result + 2 ; 5 cyc
    ror result + 1 ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ror result ; 5 cyc
    .endif
.endmacro

; Replace the signed 16-bit value at arg with its magnitude and count the
; negative inputs seen so far in X.
;   - non-negative arg: left untouched, X unchanged
;   - negative arg:     negated in place via neg16, X incremented
; The caller is expected to initialise X before the first use.
; Clobbers: A, flags; X as described.
; Cycles with arg in zero page: 6 (non-negative, branch taken) to
; 25 (negative), ignoring any page-crossing branch penalty.
.macro check_sign arg
    .local done
    lda arg + 1   ; 3 cyc - the high byte holds the sign in bit 7
    bpl done      ; 2 cyc, 3 when taken
    neg16 arg     ; 18 cyc - flip the argument to positive
    inx           ; 2 cyc - one more negative input
done:
.endmacro

; Signed 16-bit x 16-bit -> 32-bit multiply.
;
; In:  FR0..FR0+1 = first argument, FR1..FR1+1 = second argument
;      (little-endian, two's complement)
; Out: FR2..FR2+3 = 32-bit result
; Clobbers: A, X, flags. Negative arguments are negated in place, so FR0 and
; FR1 hold magnitudes on return.
;
; Method: strip the signs (counting them in X), run 16 unrolled bitmul16
; steps on the magnitudes, then negate the result when exactly one input
; was negative.
;
; Timing is dominated by the 16 bitmul16 expansions; see the cycle
; annotations on check_sign, bitmul16 and neg32 for the per-part costs.
.proc imul16
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

    ; Clear the top 16 bits of the accumulator. The bottom two bytes need
    ; no initialisation: the shifts below flush them out.
    lda #0
    sta result + 3
    sta result + 2

    ; X = number of negative arguments (0, 1 or 2)
    ldx #0
    check_sign arg1
    check_sign arg2

    ; Fully unrolled for speed, at the cost of a larger routine:
    ; one bitmul16 step per bit of arg1, least significant bit first.
    .repeat 16, step
        bitmul16 arg1, arg2, result, step
    .endrepeat

    ; Mixed input signs (exactly one negative) give a negative result.
    cpx #1
    bne done
    neg32 result
done:

    rts
.endproc

; Work-in-progress stub: the body is pseudo-code only and assembles to no
; instructions.
; NOTE(review): there is no rts yet, so the address of `iter` coincides with
; whatever code follows it; do not call it until it is implemented.
.proc iter
    ; (cx and cy should be pre-scaled to 6.26 fixed point)
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

    ; still working on the fixed-point
loop:
    ; iters++

    ; 6.26:
    ; zx = zx_2 - zy_2 + cx
    ; zy = zx_zy + zx_zy + cy
    ; round to 6.10.
    ; NOTE(review): the zx line previously read "zx_2 + zy_2 + cx"; for the
    ; iteration z = z^2 + c the real part is zx^2 - zy^2 + cx, so the '+'
    ; looked like a typo -- confirm.

    ; 12.20:
    ; zx_2 = zx * zx
    ; zy_2 = zy * zy
    ; dist = zx_2 + zy_2
    ; if dist >= 4 break, else continue iterating
    ; NOTE(review): zx_zy is used above but never recomputed in this loop;
    ; presumably zx_zy = zx * zy belongs here -- confirm.

    ; round zx_2, zy_2, dist to 6.26

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters

.endproc

; Program entry point (exported). Endless smoke test for imul16: loads
; 5 and -3 into the argument registers, multiplies them, and repeats.
.proc start

again:
    ; FR0 = 5, as the 16-bit value $0005
    lda #<$0005
    sta FR0
    lda #>$0005
    sta FR0 + 1

    ; FR1 = -3, as the 16-bit two's-complement value $fffd
    lda #<$fffd
    sta FR1
    lda #>$fffd
    sta FR1 + 1

    jsr imul16
    ; FR2..FR2+3 should now hold 32-bit -15 ($fffffff1)

    jmp again
.endproc