; Equates for the Atari 8-bit OS floating-point package.
; Symbol names are kept exactly as the rest of this file spells them.

; FP registers and work area in zero page.
; FR0, FRE, FR1 and FR2 are spaced 6 bytes apart (one FP number each).
FR0 = $d4
FRE = $da
FR1 = $e0
FR2 = $e6
FRX = $ec
EEXP = $ed
NSIGN = $ee
ESIGN = $ef
FLPTR = $fc
FPTR2 = $fe

; FP routines (ROM entry points)
AFP = $D800
FASC = $D8E6
IFP = $D9AA
FIP = $D9D2    ; NOTE(review): OS listings spell this FPI - confirm before renaming
ZFR0 = $DA44
ZFI = $DA46    ; NOTE(review): OS listings spell this ZF1 - confirm before renaming
FSUB = $DA60
FADD = $DA66
FMUL = $DADB
FDIV = $DB28
PLYVEL = $DD40 ; NOTE(review): OS listings spell this PLYEVL - confirm before renaming
; The four load entry points are distinct addresses. Earlier values here
; had FLD0R at $DD49 and both FLD0P and FLD1R at $DD89.
FLD0R = $DD89 ; FR0 <- from pointer in X/Y
FLD0P = $DD8D ; FR0 <- from pointer in FLPTR
FLD1R = $DD98 ; FR1 <- from pointer in X/Y
FLD1P = $DD9C ; FR1 <- from pointer in FLPTR
FST0R = $DDA7
FST0P = $DDAB
FMOVE = $DDB6 ; FR0 -> FR1
EXP = $DDC0
EXP10 = $DDCC
LOG = $DECD
LOG10 = $DED1


.code

.export start

; sext16to32 arg
; Sign-extends the little-endian 16-bit value at arg..arg+1 to 32 bits by
; filling arg+2 and arg+3 with $00 (bit 15 clear) or $ff (bit 15 set).
; arg and arg+1 are not modified.
; Clobbers: A and the N, Z, C flags.
.macro sext16to32 arg
    .local fill
    lda arg+1
    asl             ; bit 15 (sign) -> carry
    lda #$00        ; assume non-negative; lda does not touch carry
    bcc fill
    lda #$ff        ; sign bit was set: extend with ones
fill:
    sta arg+2
    sta arg+3
.endmacro

; copy bytes, arg1, arg2
; Copies `bytes` consecutive bytes from arg1 (source) to arg2 (destination),
; lowest address first. The count is honoured for every width; it used to
; be fixed at 2, which made copy32 move only half of its operand.
; Clobbers: A and the N, Z flags.
.macro copy bytes, arg1, arg2
    .repeat bytes, i
        lda arg1 + i
        sta arg2 + i
    .endrepeat
.endmacro

; copy16 arg1, arg2 - copy a 2-byte value from arg1 to arg2.
.macro copy16 arg1, arg2
    copy 2, arg1, arg2
.endmacro

; copy32 arg1, arg2 - copy a 4-byte value from arg1 to arg2.
.macro copy32 arg1, arg2
    copy 4, arg1, arg2
.endmacro

; neg bytes, arg
; Negates the little-endian `bytes`-byte value at arg in place by computing
; 0 - arg, with the borrow propagating from the lowest byte upward.
; Clobbers: A and the N, V, Z, C flags.
; Cost: 2 + 8 * bytes cycles (the per-instruction counts below are the
; original author's figures; they match a zero-page operand).
.macro neg bytes, arg
    sec                     ; 2 cyc - start with no borrow
    .repeat bytes, i        ; 8 cyc per byte
        lda #$00            ; 2 cyc
        sbc arg + i         ; 3 cyc
        sta arg + i         ; 3 cyc
    .endrepeat
.endmacro

; neg16 value - negate a 2-byte value in place (18 cycles).
.macro neg16 value
    neg 2, value
.endmacro

; neg32 value - negate a 4-byte value in place (34 cycles).
.macro neg32 value
    neg 4, value
.endmacro

; add bytes, arg1, arg2
; Multi-byte little-endian addition: arg1 = arg1 + arg2 over `bytes` bytes.
; arg2 is only read. The carry out of the top byte is left in C.
; Clobbers: A and the N, V, Z, C flags.
.macro add bytes, arg1, arg2
    clc                     ; no carry into the lowest byte
    .repeat bytes, i
        lda arg1 + i
        adc arg2 + i
        sta arg1 + i
    .endrepeat
.endmacro

; add16 acc, addend - 2-byte acc += addend.
.macro add16 acc, addend
    add 2, acc, addend
.endmacro

; add32 acc, addend - 4-byte acc += addend.
.macro add32 acc, addend
    add 4, acc, addend
.endmacro

; shl bytes, arg
; Shifts the little-endian `bytes`-byte value at arg left by one bit in
; place. The lowest byte gets a 0 shifted in; each higher byte takes the
; bit carried out of the byte below it. The bit shifted out of the top
; byte ends up in C.
; Clobbers: the N, Z, C flags (A is untouched).
.macro shl bytes, arg
    .repeat bytes, i
        .if i = 0
            asl arg         ; lowest byte: shift in a zero
        .else
            rol arg + i     ; higher bytes: shift in the previous carry
        .endif
    .endrepeat
.endmacro

; shl16 value - 2-byte left shift by one bit.
.macro shl16 value
    shl 2, value
.endmacro

; shl24 value - 3-byte left shift by one bit.
.macro shl24 value
    shl 3, value
.endmacro

; shl32 value - 4-byte left shift by one bit.
.macro shl32 value
    shl 4, value
.endmacro

; shr bytes, arg
; Logical (unsigned) right shift by one bit of the little-endian
; `bytes`-byte value at arg, in place. A 0 enters the top bit and the bit
; shifted out of the lowest byte ends up in C.
;
; The value is stored low byte first, so the shift has to start at the
; MOST significant byte (arg + bytes - 1) and rotate the carry downward.
; Starting at the low byte would push each byte's bit 0 into the top of
; the byte above it.
; Clobbers: the N, Z, C flags (A is untouched).
.macro shr bytes, arg
    lsr arg + bytes - 1             ; top byte: shift in a zero
    .repeat bytes - 1, i
        ror arg + bytes - 2 - i     ; next lower byte: shift in the carry
    .endrepeat
.endmacro

; shr16 arg - 2-byte logical right shift by one bit.
.macro shr16 arg
    shr 2, arg
.endmacro

; shr24 arg - 3-byte logical right shift by one bit.
.macro shr24 arg
    shr 3, arg
.endmacro

; shr32 arg - 4-byte logical right shift by one bit.
.macro shr32 arg
    shr 4, arg
.endmacro

; bitmul16 arg1, arg2, result, bitnum
; One step of an unrolled shift-and-add multiply. If bit `bitnum` (0-15)
; of the 16-bit value at arg1 is set, the 16-bit value at arg2 is added to
; the upper half of the 32-bit accumulator (result+2, result+3). The
; accumulator is then rotated right one bit, with the carry from the add
; (or 0 when the add is skipped) entering bit 31.
;
; Both operands must be zero or positive. arg1 and arg2 are only read.
; Clobbers: A and flags.
.macro bitmul16 arg1, arg2, result, bitnum
    .local rotate

    ; Cleared here so it serves both as carry-in for the add and as the
    ; bit rotated in when the add is skipped; lda/and leave it alone.
    clc

    ; Test the selected multiplier bit: byte = bitnum / 8, mask = bit within it.
    lda arg1 + (bitnum >> 3)
    and #(1 << (bitnum & 7))
    beq rotate

    ; Multiplier bit is 1: accumulate arg2 into the top 16 bits.
    lda result + 2
    adc arg2
    sta result + 2
    lda result + 3
    adc arg2 + 1
    sta result + 3

rotate:
    ; Rotate the accumulator right by one, carry feeding the top bit.
    ror result + 3
    ror result + 2
    ror result + 1
    .if bitnum > 7
        ; For bits 0-7 the lowest byte still holds uninitialized data that
        ; later steps shift out, so rotating it is skipped
        ; (5 cycles * 8 bits = 40 cycles saved).
        ror result
    .endif
.endmacro

; check_sign arg
; Replaces the 16-bit value at arg with its absolute value and counts
; negative inputs: if bit 15 of arg is set, arg is negated in place and X
; is incremented; otherwise arg and X are left alone.
; The caller initialises X before the first use.
; Clobbers: A and flags (plus X as described).
.macro check_sign arg
    .local non_negative
    lda arg + 1         ; high byte carries the sign in bit 7
    bpl non_negative
    neg16 arg           ; flip to positive
    inx                 ; one more negative operand seen
non_negative:
.endmacro

; imul16 - signed 16 x 16 -> 32-bit multiply.
;
; In:    FR0..FR0+1   first operand, little-endian two's complement
;        FR1..FR1+1   second operand, little-endian two's complement
; Out:   FR2..FR2+3   32-bit product, little-endian two's complement
; Clobb: A, X, flags; FR0 and FR1 are left holding the operands'
;        absolute values.
;
; Method: take absolute values while counting negative operands in X, run
; 16 unrolled shift-and-add steps on the magnitudes, then negate the
; product when exactly one operand was negative.
.proc imul16
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

    ; X = number of negative operands (0, 1 or 2)
    ldx #0
    check_sign arg1
    check_sign arg2

    ; Only the upper half of the accumulator needs clearing; the lower
    ; two bytes are fully overwritten by the 16 right-rotations below.
    lda #0
    sta result + 2
    sta result + 3

    ; Fully unrolled, one step per multiplier bit (LSB first): faster than
    ; a counted loop, at the cost of code size.
    .repeat 16, i
        bitmul16 arg1, arg2, result, i
    .endrepeat

    ; Signs differed only when exactly one operand was negative.
    cpx #1
    bne done
    neg32 result
done:

    rts
.endproc

.proc iter
    ; Design sketch for the per-point Mandelbrot iteration.
    ; NOTE(review): this procedure currently assembles to no instructions
    ; and has no rts, so control entering it would continue into whatever
    ; code follows it in the segment.

    ; (cx and cy should be pre-scaled to 6.26 fixed point)
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

loop:
    ; iters++

    ; 6.26:
    ; zx = zx_2 - zy_2 + cx      (real part of z^2 + c; zy_2 is subtracted)
    ; zy = zx_zy + zx_zy + cy    (imaginary part: 2 * zx * zy + cy)
    ; round to 6.10.

    ; 12.20:
    ; zx_2 = zx * zx
    ; zy_2 = zy * zy
    ; NOTE(review): zx_zy = zx * zy is used above but is not listed among
    ; the products here - presumably it belongs with them; confirm.
    ; dist = zx_2 + zy_2
    ; if dist >= 4 break, else continue iterating

    ; round zx_2, zy_2, dist to 6.26

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters

.endproc

; start - exported entry point; endless smoke test for imul16.
; Each pass loads 5 into FR0 and -3 into FR1 as 16-bit values, calls
; imul16, and repeats. It never returns.
.proc start
    ; Test operands as 16-bit two's-complement constants.
    lhs = 5
    rhs = $fffd             ; -3

again:
    ; FR0 = lhs
    lda #<lhs
    sta FR0
    lda #>lhs
    sta FR0 + 1
    ; FR1 = rhs
    lda #<rhs
    sta FR1
    lda #>rhs
    sta FR1 + 1

    jsr imul16
    ; FR2..FR2+3 should now hold 32-bit -15 ($fffffff1)

    jmp again
.endproc