mandel-6502/mandel.s

; ---------------------------------------------------------------------
; Atari OS floating-point package: zero-page work area.
; In this file FR0/FR1/FR2 are also borrowed as plain integer scratch
; space by imul16.
; ---------------------------------------------------------------------
FR0   = $d4     ; FP register 0 (6 bytes)
FRE   = $da     ; FP extra register
FR1   = $e0     ; FP register 1
FR2   = $e6     ; FP register 2
FRX   = $ec     ; spare FP byte
EEXP  = $ed     ; exponent value
NSIGN = $ee     ; sign of the number
ESIGN = $ef     ; sign of the exponent
FLPTR = $fc     ; pointer to an FP number (used by the *P routines)
FPTR2 = $fe     ; second FP pointer

; ---------------------------------------------------------------------
; Atari OS floating-point package: ROM entry points.
; Symbol names are kept as the rest of the project spells them; the
; usual Atari documentation names are noted where they differ.
; ---------------------------------------------------------------------
AFP    = $D800  ; ASCII -> FP
FASC   = $D8E6  ; FP -> ASCII
IFP    = $D9AA  ; 16-bit integer -> FP
FIP    = $D9D2  ; FP -> 16-bit integer (documented as FPI)
ZFR0   = $DA44  ; zero FR0
ZFI    = $DA46  ; zero the FP register addressed by X (documented as ZF1)
FSUB   = $DA60  ; FR0 = FR0 - FR1
FADD   = $DA66  ; FR0 = FR0 + FR1
FMUL   = $DADB  ; FR0 = FR0 * FR1
FDIV   = $DB28  ; FR0 = FR0 / FR1
PLYVEL = $DD40  ; polynomial evaluation (documented as PLYEVL)
FLD0R  = $DD89  ; load FR0 from pointer in X/Y
FLD0P  = $DD8D  ; load FR0 from pointer in FLPTR
FLD1R  = $DD98  ; load FR1 from pointer in X/Y
FLD1P  = $DD9C  ; load FR1 from pointer in FLPTR
FST0R  = $DDA7  ; store FR0 to pointer in X/Y
FST0P  = $DDAB  ; store FR0 to pointer in FLPTR
FMOVE  = $DDB6  ; FR0 -> FR1
EXP    = $DDC0  ; e ** FR0
EXP10  = $DDCC  ; 10 ** FR0
LOG    = $DECD  ; natural log of FR0
LOG10  = $DED1  ; base-10 log of FR0


.code

.export start

; Sign-extend the signed 16-bit little-endian value at `arg` to 32 bits
; in place: bytes arg+2 and arg+3 become $00 when the value is zero or
; positive and $ff when it is negative.
; Clobbers A and the flags.
.macro sext16to32 arg
    .local store
    lda arg + 1
    asl             ; sign bit -> carry
    lda #$00        ; fill byte for a non-negative value
    bcc store
    lda #$ff        ; fill byte for a negative value
store:
    sta arg + 2
    sta arg + 3
.endmacro

; Copy `bytes` consecutive bytes from arg1 to arg2, low byte first.
; Clobbers A and the N/Z flags.
.macro copy bytes, arg1, arg2
    .repeat bytes, byte     ; honour the requested width
        lda arg1 + byte
        sta arg2 + byte
    .endrepeat
.endmacro

; Copy a 16-bit value from arg1 to arg2.
.macro copy16 arg1, arg2
    copy 2, arg1, arg2
.endmacro

; Copy a 32-bit value from arg1 to arg2.
.macro copy32 arg1, arg2
    copy 4, arg1, arg2
.endmacro

; Two's-complement negate a little-endian value in place (value = 0 - value).
;   width - operand size in bytes
;   value - address of the low byte
; Clobbers A and the flags.
; Cost with a zero-page operand: 2 + 8 * width cycles.
.macro neg width, value
    sec                         ; 2 cycles: start with no borrow
    .repeat width, idx          ; 8 cycles per byte
        lda #0                  ; 2 cycles
        sbc value + idx         ; 3 cycles
        sta value + idx         ; 3 cycles
    .endrepeat
.endmacro

; Negate a 16-bit value in place (18 cycles in zero page).
.macro neg16 value
    neg 2, value
.endmacro

; Negate a 32-bit value in place (34 cycles in zero page).
.macro neg32 value
    neg 4, value
.endmacro

; Multi-byte little-endian addition: dst = dst + src.
;   width - operand size in bytes
;   dst   - address of the accumulated value (updated in place)
;   src   - address of the addend (left unchanged)
; Clobbers A and the flags; the carry out of the top byte is left in C.
.macro add width, dst, src
    clc                         ; no carry into the low byte
    .repeat width, idx
        lda dst + idx
        adc src + idx
        sta dst + idx
    .endrepeat
.endmacro

; 16-bit dst += src.
.macro add16 dst, src
    add 2, dst, src
.endmacro

; 32-bit dst += src.
.macro add32 dst, src
    add 4, dst, src
.endmacro

; Shift a little-endian value left by one bit, in place.
;   width - operand size in bytes
;   value - address of the low byte
; The low byte is shifted with asl (a zero enters bit 0); each higher
; byte is rotated with rol so it picks up the bit carried out of the
; byte below. The bit shifted out of the top byte ends up in C.
.macro shl width, value
    .repeat width, idx
        .if idx = 0
            asl value
        .else
            rol value + idx
        .endif
    .endrepeat
.endmacro

; Shift a 16-bit value left by one bit.
.macro shl16 value
    shl 2, value
.endmacro

; Shift a 24-bit value left by one bit.
.macro shl24 value
    shl 3, value
.endmacro

; Shift a 32-bit value left by one bit.
.macro shl32 value
    shl 4, value
.endmacro

; Logical (unsigned) shift right by one bit of a little-endian value,
; in place.
;   bytes - operand size in bytes
;   arg   - address of the low byte
; A right shift has to start at the MOST significant byte so that the
; bit falling out of each byte is carried into the byte below it:
; lsr on the top byte (a zero enters bit 7), then ror on each lower
; byte in descending order. The bit shifted out of the low byte ends
; up in C.
.macro shr bytes, arg
    lsr arg + (bytes) - 1
    .repeat (bytes) - 1, byte
        ror arg + (bytes) - 2 - byte
    .endrepeat
.endmacro

; Shift a 16-bit value right by one bit.
.macro shr16 arg
    shr 2, arg
.endmacro

; Shift a 24-bit value right by one bit.
.macro shr24 arg
    shr 3, arg
.endmacro

; Shift a 32-bit value right by one bit.
.macro shr32 arg
    shr 4, arg
.endmacro

; One step of a 16 x 16 -> 32 bit shift-and-add multiply.
;   multiplier   - 16-bit value whose bit `bitidx` is examined
;   multiplicand - 16-bit value added in when that bit is set
;   product      - 32-bit accumulator, shifted right once per step
;   bitidx       - bit number 0..15 handled by this step
; Both inputs must be zero or positive. Clobbers A and the flags.
.macro bitmul16 multiplier, multiplicand, product, bitidx
    .local shift_down

    ; If the add below is skipped, this cleared carry is what gets
    ; rotated into the top of the product.
    clc

    ; Pick the byte that holds the bit and isolate it.
    lda multiplier + (bitidx >> 3)
    and #(1 << (bitidx & 7))
    beq shift_down

    ; Bit is set: add the multiplicand into the upper half of the product.
    lda product + 2
    adc multiplicand
    sta product + 2
    lda product + 3
    adc multiplicand + 1
    sta product + 3

shift_down:
    ; Rotate the product right one bit; the carry from the add (or the
    ; cleared carry) becomes the new top bit.
    ror product + 3
    ror product + 2
    ror product + 1
    .if bitidx > 7
        ; During the first eight steps the lowest byte still holds
        ; uninitialized data, so rotating it is skipped there
        ; (5 cycles * 8 steps = 40 cycles saved).
        ror product
    .endif
.endmacro

; Replace the signed 16-bit value at `value` with its negation when it
; is negative, and bump X once each time that happens, so X ends up
; counting how many negative inputs were seen.
; Clobbers A and the flags; X is incremented only for a negative input.
.macro check_sign value
    .local nonnegative
    lda value + 1           ; N flag <- sign bit in the high byte
    bpl nonnegative
    neg16 value
    inx
nonnegative:
.endmacro

; Signed 16 x 16 -> 32 bit multiply.
;   In:  FR0 = first 16-bit signed operand  (little-endian)
;        FR1 = second 16-bit signed operand (little-endian)
;   Out: FR2 = 32-bit signed product        (little-endian)
; Clobbers A, X and the flags. A negative operand is negated in place,
; so FR0/FR1 do not keep their sign on return.
.proc imul16
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

    ; X doubles as the zero source for the accumulator's upper half and
    ; as the negative-operand counter. The lower two result bytes need
    ; no clearing: the multiply steps shift their old contents out.
    ldx #0
    stx result + 2
    stx result + 3

    ; Make both operands non-negative, counting the flips in X.
    check_sign arg1
    check_sign arg2

    ; Fully unrolled: one multiply step per multiplier bit, trading
    ; code size for speed.
    .repeat 16, bitnum
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; Exactly one negative operand means the product is negative.
    cpx #1
    bne done
    neg32 result
done:
    rts
.endproc

.proc iter
    ; Mandelbrot iteration for one point -- planning notes only.
    ; NOTE(review): this proc currently emits no instructions (not even
    ; an rts), so calling it would run straight into whatever follows.
    ;
    ; (cx and cy should be pre-scaled to 6.26 fixed point)
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

loop:
    ; iters++

    ; 6.26:
    ; zx = zx_2 - zy_2 + cx       (real part of z^2 + c)
    ; zy = zx_zy + zx_zy + cy     (imaginary part: 2 * zx * zy + cy)
    ; round to 6.10.

    ; 12.20 (product of two 6.10 values):
    ; zx_2 = zx * zx
    ; zy_2 = zy * zy
    ; zx_zy = zx * zy
    ; dist = zx_2 + zy_2
    ; if dist >= 4 break, else continue iterating

    ; convert zx_2, zy_2, zx_zy, dist from 12.20 back to 6.26

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters

.endproc

; Program entry point: a smoke test that multiplies 5 by -3 with imul16
; over and over, so the result can be inspected in FR2.
.proc start
    test_a = 5
    test_b = -3

again:
    ; FR0 = test_a as a 16-bit little-endian value
    lda #<test_a
    sta FR0
    lda #>test_a
    sta FR0 + 1

    ; FR1 = test_b as a 16-bit two's-complement value ($fffd)
    lda #<test_b
    sta FR1
    lda #>test_b
    sta FR1 + 1

    jsr imul16
    ; FR2 should now hold the 32-bit value -15

    jmp again
.endproc
stuff 2022-12-29 05:08:16 +00:00			`; FP registers in zero page`
			`FR0 = $d4`
			`FRE = $da`
			`FR1 = $e0`
			`FR2 = $e6`
			`FRX = $ec`
			`EEXP = $ed`
			`NSIGN = $ee`
			`ESIGN = $ef`
			`FLPTR = $fc`
			`FPTR2 = $fe`

			`; FP routines`
			`AFP = $D800`
			`FASC = $D8E6`
			`IFP = $D9AA`
			`FIP = $D9D2`
			`ZFR0 = $DA44`
			`ZFI = $DA46`
			`FSUB = $DA60`
			`FADD = $DA66`
			`FMUL = $DADB`
			`FDIV = $DB28`
			`PLYVEL = $DD40`
			`FLD0R = $DD49 ; from pointer in X/Y`
			`FLD0P = $DD89 ; from pointer in FLPTR`
			`FLD1R = $DD89`
			`FLD1P = $DD9c`
			`FST0R = $DDA7`
			`FST0P = $DDAB`
			`FMOVE = $DDB6 ; FR0 -> FR1`
			`EXP = $DDC0`
			`EXP10 = $DDCC`
			`LOG = $decd`
			`LOG10 = $ded1`


			`.code`

			`.export start`

			`.macro sext16to32 arg`
			`.local plus`
			`.local minus`
			`lda arg+1`
looks workable 2022-12-30 08:43:44 +00:00			`asl ; sign -> carry`
stuff 2022-12-29 05:08:16 +00:00			`lda #$ff`
looks workable 2022-12-30 08:43:44 +00:00			`bcc plus`
stuff 2022-12-29 05:08:16 +00:00			`lda #$00`
looks workable 2022-12-30 08:43:44 +00:00			`plus:`
stuff 2022-12-29 05:08:16 +00:00			`sta arg+2`
			`sta arg+3`
			`.endmacro`

			`.macro copy bytes, arg1, arg2`
			`.repeat 2, byte`
			`lda arg1+byte`
			`sta arg2+byte`
			`.endrepeat`
			`.endmacro`

			`.macro copy16 arg1, arg2`
			`copy 2, arg1, arg2`
			`.endmacro`

			`.macro copy32 arg1, arg2`
			`copy 4, arg1, arg2`
			`.endmacro`

looks workable 2022-12-30 08:43:44 +00:00			`; 2 + 8 * byte cycles`
			`.macro neg bytes, arg`
			`sec ; 2 cyc`
			`.repeat bytes, byte ; 8 * byte cycles`
			`lda #00 ; 2 cyc`
			`sbc arg + byte ; 3 cyc`
			`sta arg + byte ; 3 cyc`
			`.endrepeat`
			`.endmacro`

			`; 18 cycles`
			`.macro neg16 arg`
			`neg 2, arg`
			`.endmacro`

			`; 34 cycles`
			`.macro neg32 arg`
			`neg 4, arg`
			`.endmacro`

stuff 2022-12-29 05:08:16 +00:00			`.macro add bytes, arg1, arg2`
			`clc`
			`.repeat bytes, byte`
			`lda arg1+byte`
			`adc arg2+byte`
			`sta arg1+byte`
			`.endrepeat`
			`.endmacro`

			`.macro add16 arg1, arg2`
			`add 2, arg1, arg2`
			`.endmacro`

			`.macro add32 arg1, arg2`
			`add 4, arg1, arg2`
			`.endmacro`

			`.macro shl bytes, arg`
			`asl arg`
			`.repeat bytes-1, byte`
			`rol arg+byte+1`
			`.endrepeat`
			`.endmacro`

			`.macro shl16 arg`
			`shl 2, arg`
			`.endmacro`

			`.macro shl24 arg`
			`shl 3, arg`
			`.endmacro`

			`.macro shl32 arg`
			`shl 4, arg`
			`.endmacro`

			`.macro shr bytes, arg`
			`lsr arg`
			`.repeat bytes-1, byte`
			`ror arg+byte+1`
			`.endrepeat`
			`.endmacro`

			`.macro shr16 arg`
			`shr 2, arg`
			`.endmacro`

			`.macro shr24 arg`
			`shr 3, arg`
			`.endmacro`

			`.macro shr32 arg`
			`shr 4, arg`
			`.endmacro`

nice 2022-12-30 04:18:21 +00:00			`.macro bitmul16 arg1, arg2, result, bitnum`
stuff 2022-12-29 05:08:16 +00:00			`.local next`
hmm 2022-12-29 11:37:51 +00:00
looks workable 2022-12-30 08:43:44 +00:00			`; does 16-bit adds`
			`; arg1 must be 0 or positive`
			`; arg2 must be 0 or positive`

hmm 2022-12-29 11:37:51 +00:00			`clc`
nice 2022-12-30 04:18:21 +00:00
			`; check if arg1 has 0 or 1 bit in this place`
			`.if bitnum < 8`
			`lda arg1`
			`and #(1 << bitnum)`
			`.else`
			`lda arg1 + 1`
			`and #(1 << (bitnum - 8))`
			`.endif`
stuff 2022-12-29 05:08:16 +00:00			`beq next`
hmm 2022-12-29 11:37:51 +00:00
			`; 16-bit add on the top bits`
nice 2022-12-30 04:18:21 +00:00			`lda result + 2`
			`adc arg2`
			`sta result + 2`
			`lda result + 3`
			`adc arg2 + 1`
			`sta result + 3`
hmm 2022-12-29 11:37:51 +00:00
stuff 2022-12-29 05:08:16 +00:00			`next:`
nice 2022-12-30 04:18:21 +00:00			`; Shift the 32-bit result down by one bit,`
			`; saving the previous carry.`
			`ror result + 3`
			`ror result + 2`
			`ror result + 1`
			`.if bitnum >= 8`
			`; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte`
			`; when it's all uninitialized data`
			`ror result`
			`.endif`
stuff 2022-12-29 05:08:16 +00:00			`.endmacro`

looks workable 2022-12-30 08:43:44 +00:00			`.macro check_sign arg`
			`; Check sign bit and flip argument to postive,`
			`; keeping a count of sign bits in the X register.`
			`.local positive`
			`lda arg + 1`
			`bpl positive`
			`neg16 arg`
			`inx`
			`positive:`
			`.endmacro`
stuff 2022-12-29 05:08:16 +00:00
looks workable 2022-12-30 08:43:44 +00:00			`.proc imul16`
			`arg1 = FR0 ; 16-bit arg (clobbered)`
			`arg2 = FR1 ; 16-bit arg (clobbered)`
			`result = FR2 ; 32-bit result`

			`ldx #0`
			`; counts the number of sign bits in X`
			`check_sign arg1`
			`check_sign arg2`

nice 2022-12-30 04:18:21 +00:00			`; zero out the 32-bit temp's top 16 bits`
stuff 2022-12-29 05:08:16 +00:00			`lda #0`
looks workable 2022-12-30 08:43:44 +00:00			`sta result + 2`
			`sta result + 3`
hmm 2022-12-29 11:37:51 +00:00			`; the bottom two bytes will get cleared by the shifts`
stuff 2022-12-29 05:08:16 +00:00
looks workable 2022-12-30 08:43:44 +00:00			`; unrolled loop for maximum speed, at the cost`
			`; of a larger routine`
stuff 2022-12-29 05:08:16 +00:00			`.repeat 16, bitnum`
looks workable 2022-12-30 08:43:44 +00:00			`bitmul16 arg1, arg2, result, bitnum`
stuff 2022-12-29 05:08:16 +00:00			`.endrepeat`
nice 2022-12-30 04:18:21 +00:00
looks workable 2022-12-30 08:43:44 +00:00			`; In case of mixed input signs, return a negative result.`
			`cpx #1`
			`bne positive_result`
			`neg32 result`
			`positive_result:`

nice 2022-12-30 04:18:21 +00:00			`rts`
stuff 2022-12-29 05:08:16 +00:00			`.endproc`

			`.proc iter`
			`; (cx and cy should be pre-scaled to 6.26 fixed point)`
			`; zx = 0`
			`; zy = 0`
			`; zx_2 = 0`
maybe 2022-12-30 08:55:48 +00:00			`; zy_2 = 0`
			`; zx_zy = 0`
stuff 2022-12-29 05:08:16 +00:00
			`loop:`
			`; iters++`

maybe 2022-12-30 08:55:48 +00:00			`; 6.26:`
			`; zx = zx_2 + zy_2 + cx`
			`; zy = zx_zy + zx_zy + cy`
			`; round to 6.10.`
stuff 2022-12-29 05:08:16 +00:00
maybe 2022-12-30 08:55:48 +00:00			`; 12.20:`
stuff 2022-12-29 05:08:16 +00:00			`; zx_2 = zx * zx`
			`; zy_2 = zy * zy`
			`; dist = zx_2 + zy_2`
			`; if dist >= 4 break, else continue iterating`

maybe 2022-12-30 08:55:48 +00:00			`; round zx_2, zy_2, dist to 6.26`

			`; if may be in the lake, look for looping output with a small buffer`
			`; as an optimization vs running to max iters`

stuff 2022-12-29 05:08:16 +00:00			`.endproc`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00
			`.proc start`
looks workable 2022-12-30 08:43:44 +00:00
			`loop:`
			`; FR0 = 5`
			`; FR1 = -3`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00			`lda #5`
looks workable 2022-12-30 08:43:44 +00:00			`sta FR0`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00			`lda #0`
			`sta FR0 + 1`
looks workable 2022-12-30 08:43:44 +00:00			`lda #$fd`
			`sta FR1`
			`lda #$ff`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00			`sta FR1 + 1`

			`jsr imul16`
looks workable 2022-12-30 08:43:44 +00:00			`; should have 32-bit -15 in FR2`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00
			`jmp loop`
			`.endproc`