; ---------------------------------------------------------------------
; Atari OS floating-point package: zero-page workspace
; ---------------------------------------------------------------------
FR0   = $D4     ; FP register 0 (6 bytes)
FRE   = $DA     ; FP extra register (6 bytes)
FR1   = $E0     ; FP register 1 (6 bytes)
FR2   = $E6     ; FP register 2 (6 bytes)
FRX   = $EC     ; FP spare byte
EEXP  = $ED     ; value of exponent
NSIGN = $EE     ; sign of number
ESIGN = $EF     ; sign of exponent
FLPTR = $FC     ; pointer to an FP number (2 bytes)
FPTR2 = $FE     ; second FP pointer (2 bytes)

; ---------------------------------------------------------------------
; Atari OS floating-point package: ROM entry points
;
; FLD0R, FLD0P and FLD1R previously carried wrong addresses ($DD49,
; $DD89 and $DD89); they now match the OS ROM entry table.
; The symbols FIP, ZFI and PLYVEL are this file's spellings of the
; routines the OS documentation calls FPI, ZF1 and PLYEVL.
; ---------------------------------------------------------------------
AFP    = $D800  ; ASCII -> FP
FASC   = $D8E6  ; FP -> ASCII
IFP    = $D9AA  ; 16-bit integer -> FP
FIP    = $D9D2  ; FP -> 16-bit integer (OS name: FPI)
ZFR0   = $DA44  ; clear FR0
ZFI    = $DA46  ; clear the FP register addressed by X (OS name: ZF1)
FSUB   = $DA60  ; FR0 = FR0 - FR1
FADD   = $DA66  ; FR0 = FR0 + FR1
FMUL   = $DADB  ; FR0 = FR0 * FR1
FDIV   = $DB28  ; FR0 = FR0 / FR1
PLYVEL = $DD40  ; polynomial evaluation (OS name: PLYEVL)
FLD0R  = $DD89  ; load FR0 from pointer in X/Y
FLD0P  = $DD8D  ; load FR0 from pointer in FLPTR
FLD1R  = $DD98  ; load FR1 from pointer in X/Y
FLD1P  = $DD9C  ; load FR1 from pointer in FLPTR
FST0R  = $DDA7  ; store FR0 to pointer in X/Y
FST0P  = $DDAB  ; store FR0 to pointer in FLPTR
FMOVE  = $DDB6  ; FR0 -> FR1
EXP    = $DDC0  ; FR0 = e ^ FR0
EXP10  = $DDCC  ; FR0 = 10 ^ FR0
LOG    = $DECD  ; FR0 = ln(FR0)
LOG10  = $DED1  ; FR0 = log10(FR0)


.code

.export start

; start - exported entry point.
; Currently just parks the CPU: a single JMP that targets itself.
.proc start
forever:
    jmp forever
.endproc

; mandelfloat - placeholder procedure.
; The body is empty, so no instructions are emitted for it yet.
.proc mandelfloat
    ; TODO: not implemented
.endproc

; sext16to32 arg
;   Sign-extend the little-endian 16-bit value at arg..arg+1 to 32 bits
;   by filling arg+2 and arg+3 with $00 (bit 7 of arg+1 clear) or $FF
;   (bit 7 of arg+1 set).
;   Clobbers: A, N, Z.  A is left holding the fill byte.
.macro sext16to32 arg
    .local negative
    .local fill
    lda arg+1           ; N = sign bit of the 16-bit value
    bmi negative
    lda #$00
    beq fill            ; always taken: Z was just set by the LDA
negative:
    lda #$ff
fill:
    sta arg+2
    sta arg+3
.endmacro

; copy bytes, arg1, arg2
;   Copy `bytes` consecutive bytes from arg1 to arg2 (source first,
;   destination second), lowest address first.
;   The repeat count is now taken from `bytes`; it used to be fixed at 2,
;   which made copy32 silently copy only half of its operand.
;   Clobbers: A, N, Z.
.macro copy bytes, arg1, arg2
    .repeat bytes, idx
        lda arg1+idx
        sta arg2+idx
    .endrepeat
.endmacro

; copy16 arg1, arg2 - copy 2 bytes from arg1 to arg2.
.macro copy16 arg1, arg2
    copy 2, arg1, arg2
.endmacro

; copy32 arg1, arg2 - copy 4 bytes from arg1 to arg2.
.macro copy32 arg1, arg2
    copy 4, arg1, arg2
.endmacro

; add bytes, arg1, arg2
;   Multi-byte little-endian addition: arg1 = arg1 + arg2 over `bytes`
;   bytes.  Carry is cleared once up front and then rippled through
;   every byte by ADC.
;   Clobbers: A and the flags ADC affects; C holds the carry out of the
;   top byte afterwards.
.macro add bytes, arg1, arg2
    clc
    .repeat bytes, idx
        lda arg1 + idx
        adc arg2 + idx
        sta arg1 + idx
    .endrepeat
.endmacro

; add16 arg1, arg2 - 16-bit arg1 += arg2.
.macro add16 arg1, arg2
    add 2, arg1, arg2
.endmacro

; add32 arg1, arg2 - 32-bit arg1 += arg2.
.macro add32 arg1, arg2
    add 4, arg1, arg2
.endmacro

; shl bytes, arg
;   Shift the little-endian `bytes`-byte value at arg left by one bit.
;   The lowest byte is shifted with ASL (0 enters bit 0); every higher
;   byte is rotated with ROL so the carry ripples upward.
;   Clobbers: N, Z, C.  C holds the bit shifted out of the top byte.
.macro shl bytes, arg
    .repeat bytes, idx
        .if idx = 0
            asl arg
        .else
            rol arg + idx
        .endif
    .endrepeat
.endmacro

; shl16 arg - 16-bit left shift by one.
.macro shl16 arg
    shl 2, arg
.endmacro

; shl24 arg - 24-bit left shift by one.
.macro shl24 arg
    shl 3, arg
.endmacro

; shl32 arg - 32-bit left shift by one.
.macro shl32 arg
    shl 4, arg
.endmacro

; shr bytes, arg
;   Logical shift right by one bit of the little-endian `bytes`-byte
;   value at arg.
;   A right shift has to start at the MOST significant byte: LSR moves
;   its bit 0 into carry, and each ROR on the next lower byte pulls that
;   carry into bit 7.  (The previous version started at the low byte,
;   which moved bits between bytes in the wrong direction.)
;   A zero is shifted into the top bit, so this is an unsigned shift.
;   Clobbers: N, Z, C.  C holds the bit shifted out of the lowest byte.
.macro shr bytes, arg
    lsr arg + bytes - 1
    .repeat bytes-1, idx
        ror arg + (bytes - 2 - idx)
    .endrepeat
.endmacro

; shr16 arg - 16-bit logical right shift by one.
.macro shr16 arg
    shr 2, arg
.endmacro

; shr24 arg - 24-bit logical right shift by one.
.macro shr24 arg
    shr 3, arg
.endmacro

; shr32 arg - 32-bit logical right shift by one.
.macro shr32 arg
    shr 4, arg
.endmacro

; checkbit arg, bits
;   Test bit number `bits` (0 = least significant) of the little-endian
;   value at arg.  The byte holding the bit is loaded and masked, so on
;   exit A is non-zero and Z is clear when the bit is set; A is zero and
;   Z is set when it is clear.
;   The byte offset and mask are derived from `bits`, so any bit index
;   works (previously only 0..15 assembled); the code emitted for
;   0..15 is unchanged.
;   Clobbers: A, N, Z.
.macro checkbit arg, bits
    lda arg + ((bits) / 8)
    and #(1 << ((bits) & 7))
.endmacro

; bitmul arg1, arg2, res, bits
;   One step of a shift-and-add multiply:
;     if bit `bits` of arg2 is set, res += arg1   (32-bit add)
;     arg1 <<= 1                                  (32-bit shift, always)
;   arg2 is only read.  Clobbers: A and flags.
.macro bitmul arg1, arg2, res, bits
    .local no_add
    checkbit arg2, bits         ; Z = 1 when the multiplier bit is clear
    beq no_add
    add32 res, arg1
no_add:
    shl32 arg1
.endmacro

; ---------------------------------------------------------------------
; imul16 - signed 16 x 16 -> 16 bit fixed-point multiply
;
; In:    FR0..FR0+1   multiplicand, signed, little-endian
;        FR1..FR1+1   multiplier,   signed, little-endian (only read)
; Out:   FR0..FR0+1   bits 13..28 of the 32-bit product, i.e. the
;                     product of two 3.13 values renormalised to 3.13
;                     (assumes the 3.13 / 6.26 formats described in
;                     iter's notes; truncated, not rounded)
; Clobb: A, flags, FR0+2..FR0+3, FRX..FRX+3
;        (FRX+1..FRX+3 are the same bytes as EEXP, NSIGN and ESIGN)
;
; Changes from the previous version: the multiplier's sign is now
; honoured, the renormalisation shifts left before the high word is
; taken, and the routine ends with RTS instead of running off its end.
; ---------------------------------------------------------------------
.proc imul16
    ; Widen the multiplicand to 32 bits so the shifted copies added
    ; below carry the right sign.
    sext16to32 FR0

    ; Zero the 32-bit accumulator.
    lda #0
    sta FRX
    sta FRX+1
    sta FRX+2
    sta FRX+3

    ; Shift and add, one step per multiplier bit.  This treats FR1 as
    ; an unsigned 16-bit number.
    .repeat 16, bitnum
        bitmul FR0, FR1, FRX, bitnum
    .endrepeat

    ; Signed correction: a negative FR1 was read as FR1 + 65536, so the
    ; accumulator holds an extra (multiplicand << 16).  After sixteen
    ; left shifts FR0+2..FR0+3 contain exactly the original low word of
    ; the multiplicand, so subtract that from the accumulator's high word.
    lda FR1+1
    bpl corrected
    sec
    lda FRX+2
    sbc FR0+2
    sta FRX+2
    lda FRX+3
    sbc FR0+3
    sta FRX+3
corrected:

    ; Re-normalize the ones place: the product has 26 fraction bits;
    ; three left shifts put 13 of them in the high word.
    shl32 FRX
    shl32 FRX
    shl32 FRX

    ; @fixme round the last bit

    ; And copy out our result (high word of the accumulator).
    copy16 FRX+2, FR0
    ; @fixme could save a few cycles by combining the last two ops

    rts
.endproc

; iter - planned Mandelbrot iteration loop.
; Only the outline below exists so far; the procedure emits no
; instructions.
; NOTE(review): there is no rts yet either, so calling this as-is would
; run on into whatever is assembled after it.
.proc iter
    ; (cx and cy should be pre-scaled to 6.26 fixed point)

    ; zx = 0
    ; zx_2 = 0
    ; zy = 0
    ; zy_2 = 0

loop:
    ; iters++

    ; zx_next = zx_2 - zy_2 + cx    (real part of z*z + c)
    ; zy_next = 2 * zx * zy + cy    (imaginary part of z*z + c)
    ; (detect overflows to -4 or +4 and break if necessary)
    ; (re-downshift into zx and zy as 3.13 fixed point; round.)

    ; zx_2 = zx * zx
    ; zy_2 = zy * zy
    ; dist = zx_2 + zy_2

    ; if dist >= 4 break, else continue iterating

.endproc