mandel-6502/mandel.s

; FP registers in zero page
; The addresses are spaced 6 bytes apart, so each symbol names the start of
; a 6-byte zero-page region. In this file they are only used as plain
; integer work areas for imul16.
; NOTE(review): the addresses look like the Atari 8-bit OS floating-point
; registers -- confirm against the target platform.
FR0 = $d4 ; imul16 arg1 (16-bit)
FRE = $da ; not referenced in this file
FR1 = $e0 ; imul16 arg2 (16-bit)
FR2 = $e6 ; imul16 result (32-bit)
FRX = $ec ; not referenced in this file

; everything below is assembled into the code segment
.code

; program entry point, made visible outside this module
.export start

; neg bytes, arg
; Two's-complement negate, in place, the little-endian value that occupies
; `bytes` consecutive bytes starting at `arg`. Works by computing 0 - arg one
; byte at a time, lowest byte first, letting the borrow ripple upward.
; Clobbers A and the flags.
; Cost with a zero-page arg: 2 + 8 * bytes cycles.
.macro neg bytes, arg
    sec                  ; 2 cyc - carry set = no borrow going in
    .repeat bytes, idx   ; 8 cyc for each byte
        lda #0           ; 2 cyc
        sbc arg + idx    ; 3 cyc - 0 - byte - borrow
        sta arg + idx    ; 3 cyc
    .endrepeat
.endmacro

; neg16 arg
; Two's-complement negate the 16-bit little-endian value at arg and arg + 1,
; in place, by expanding `neg` for 2 bytes.
; Clobbers A and the flags.
; 18 cycles (2 + 8 * 2) when arg is in zero page.
.macro neg16 arg
    neg 2, arg
.endmacro

; neg32 arg
; Two's-complement negate the 32-bit little-endian value at arg .. arg + 3,
; in place, by expanding `neg` for 4 bytes.
; Clobbers A and the flags.
; 34 cycles (2 + 8 * 4) when arg is in zero page.
.macro neg32 arg
    neg 4, arg
.endmacro

; bitmul16 arg1, arg2, result, bitnum
; inner loop for imul16: one shift-and-add step, expanded once per bit of
; arg1 with bitnum counting up from 0 (the least significant bit).
;
; If bit `bitnum` of the 16-bit value at arg1 is set, the 16-bit value at
; arg2 is added to the top 16 bits of the 32-bit result (result + 2 and
; result + 3). Either way the result is then shifted right one bit, with
; the carry out of the add (or a 0 when nothing was added) entering at the
; top.
;
; Clobbers A and the flags; arg1 and arg2 are only read.
;
; Cycles, assuming zero-page operands and no page-crossing branch
; (the taken `bne` costs 3 cycles, not 2):
; bitnum < 8: 25 (bit clear) or 40 (bit set)
; bitnum >= 8: 30 (bit clear) or 45 (bit set)
.macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next

    ; does 16-bit adds
    ; arg1 and arg2 are treated as unsigned
    ; negative signed inputs must be flipped first

    ; 7 cycles up to and including the branch when it falls through,
    ; 8 when it is taken

    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                 ; 3 cyc
        and #(1 << bitnum)       ; 2 cyc
    .else
        lda arg1 + 1             ; 3 cyc
        and #(1 << (bitnum - 8)) ; 2 cyc
    .endif
    bne one ; 2 cyc, 3 when taken

zero: ; 18 cyc, 23 cyc
    ; bit clear: nothing to add, just start the right shift with a 0
    ; entering the top bit
    lsr result + 3 ; 5 cyc
    jmp next       ; 3 cyc

one: ; 32 cyc, 37 cyc
    ; 16-bit add on the top bits
    clc            ; 2 cyc
    lda result + 2 ; 3 cyc
    adc arg2       ; 3 cyc
    sta result + 2 ; 3 cyc
    lda result + 3 ; 3 cyc
    adc arg2 + 1   ; 3 cyc
    ror a          ; 2 cyc - get a jump on the shift: the add's carry becomes bit 7
    sta result + 3 ; 3 cyc
next:
    ; carry holds the bit shifted out of result + 3; ripple it downward
    ror result + 2 ; 5 cyc
    ror result + 1 ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ; (for bitnum < 8 the bit that falls out of result + 1 is simply dropped)
        ror result ; 5 cyc
    .endif


.endmacro

; check_sign arg
; Prepare a signed 16-bit value for the unsigned multiply: if the sign bit
; of the value at arg is set, negate the value in place and bump X.
; After running this on every input, X holds the number of inputs that
; were negative.
; Clobbers A and the flags; X is incremented at most once.
; Cycles (zero-page arg): 6 if already non-negative, 25 if negated.
.macro check_sign arg
    .local nonnegative
    lda arg + 1       ; 3 cyc - high byte; N flag = sign bit
    bpl nonnegative   ; 2 cyc (3 when taken)
    neg16 arg         ; 18 cyc - flip to positive
    inx               ; 2 cyc - remember that we flipped one
nonnegative:
.endmacro

; imul16
; Signed 16 x 16 -> 32-bit multiply.
;
; In:    FR0 (arg1) and FR1 (arg2): signed 16-bit, little-endian
; Out:   FR2 (result): signed 32-bit product, little-endian
; Clobb: A, X, flags; arg1 and arg2 are replaced by their absolute values
;
; Method: both inputs are made non-negative while X counts how many were
; negative, an unrolled 16-step unsigned shift-and-add builds the product,
; and the product is negated when exactly one input was negative.
;
; Approximate cost (zero-page operands, no page-crossing branches, the
; caller's jsr not included): 473 cycles minimum, about 750 maximum.
.proc imul16
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

    ; Clear the accumulator half of the result (its top 16 bits).
    ; The low two bytes need no clearing: the 16 right shifts below
    ; push every one of their original bits out.
    lda #0          ; 2 cyc
    sta result + 3  ; 3 cyc
    sta result + 2  ; 3 cyc

    ; X = number of negative inputs; inputs become non-negative
    ldx #0          ; 2 cyc
    check_sign arg1 ; 6 or 25 cyc
    check_sign arg2 ; 6 or 25 cyc

    ; Fully unrolled: one bitmul16 expansion per bit of arg1, trading
    ; code size for speed.
    ;   bit < 8:  25 or 40 cycles
    ;   bit >= 8: 30 or 45 cycles
    .repeat 16, bit
        bitmul16 arg1, arg2, result, bit
    .endrepeat

    ; Exactly one negative input (X = 1) means a negative product.
    cpx #1          ; 2 cyc
    bne sign_done   ; 2 cyc (3 when taken)
    neg32 result    ; 34 cyc
sign_done:

    rts ; 6 cyc
.endproc

; round16_incdec arg
; Round the top 16 bits of a 32-bit two's-complement fixed-point number
; in place, to the nearest value, using the low 16 bits as the fraction.
;
; In:    arg .. arg + 3 = 32-bit little-endian value
; Out:   arg + 2, arg + 3 = rounded high word (arg, arg + 1 are left as is)
; Clobb: A, flags
;
; In two's complement the value is (high word) + (low word) / 65536 with
; the fraction always in [0, 1), whatever the sign of the high word. So
; rounding to nearest means: add 1 to the high word whenever the fraction
; is >= 0.5, i.e. whenever bit 15 of the low word is set. This is the same
; for negative numbers (e.g. $FFFE_C000 = -1.25 rounds to $FFFF = -1), so
; there is no decrementing path. Exact halves round toward +infinity.
;
; A high word of $7FFF that rounds up wraps to $8000; callers must keep
; values out of that range.
;
; Cycles (zero-page arg, no page-crossing branch):
;   no rounding needed  - 6
;   round up, no carry  - 13
;   round up, carry     - 17
.macro round16_incdec arg
    .local next

    lda arg + 1  ; 3 cyc - N flag = bit 15 of the low word (the 0.5 bit)
    bpl next     ; 2 cyc (3 when taken) - fraction < 0.5, nothing to do

    inc arg + 2  ; 5 cyc - round up
    bne next     ; 2 cyc (3 when taken) - no wrap to 0 means no carry
    inc arg + 3  ; 5 cyc - propagate the carry into the top byte

next:

.endmacro


.proc iter
    ; Design sketch for one Mandelbrot iteration. This proc contains no
    ; instructions yet (only the `loop` label) and has no rts, so calling
    ; it would run on into whatever code follows it.

    ; still working on the fixed-point
    ; should we just use 16-bit adds?
    ; does that require extra rounding?
    ; is the integer precision right?

    ; (cx and cy should be pre-scaled to 4.12 fixed point: -8 .. +7.9)
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

loop:
    ; estimated total per iteration: 1652 - 2651 cyc

    ; iters++ = 2 cyc

    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2  - zy_2  + cx   = 3 * 20 = 60 cyc
    ; zy = zx_zy + zx_zy + cy   = 3 * 20 = 60 cyc
    ; (the real part of z^2 + c is zx^2 - zy^2 + cx; the imaginary part
    ; is 2 * zx * zy + cy)

    ; 8.24: (-128 .. +127.9)
    ; zx_2 = zx * zx            = 470 - 780 cyc
    ; zy_2 = zy * zy            = 470 - 780 cyc
    ; zx_zy = zx * zy           = 470 - 780 cyc
    ; dist = zx_2 + zy_2        = 38 cyc
    ; if dist >= 4 break, else continue iterating = 7 cyc

    ; shift and round zx_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
    ; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
    ; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc

    ; if the point may be in the lake, look for looping output with a
    ; small buffer as an optimization vs running to max iters

.endproc

; start
; Program entry point. For now this is a smoke test for imul16: it loads
; 5 and -3 into the argument slots, calls the multiply, then spins forever
; so the result can be inspected.
.proc start
    multiplicand = $0005 ; 5
    multiplier   = $fffd ; -3 as 16-bit two's complement

looplong:
    ; FR0 = 5
    lda #<multiplicand
    sta FR0
    lda #>multiplicand
    sta FR0 + 1

    ; FR1 = -3
    lda #<multiplier
    sta FR1
    lda #>multiplier
    sta FR1 + 1

    jsr imul16
    ; FR2 should now hold 32-bit -15

spin:
    jmp spin
.endproc
stuff 2022-12-29 05:08:16 +00:00			`; FP registers in zero page`
			`FR0 = $d4`
			`FRE = $da`
			`FR1 = $e0`
			`FR2 = $e6`
			`FRX = $ec`

			`.code`

			`.export start`

looks workable 2022-12-30 08:43:44 +00:00			`; 2 + 8 * byte cycles`
			`.macro neg bytes, arg`
			`sec ; 2 cyc`
			`.repeat bytes, byte ; 8 * byte cycles`
			`lda #00 ; 2 cyc`
			`sbc arg + byte ; 3 cyc`
			`sta arg + byte ; 3 cyc`
			`.endrepeat`
			`.endmacro`

			`; 18 cycles`
			`.macro neg16 arg`
			`neg 2, arg`
			`.endmacro`

			`; 34 cycles`
			`.macro neg32 arg`
			`neg 4, arg`
			`.endmacro`

woo 2022-12-31 02:25:43 +00:00			`; inner loop for imul16`
hmm 2023-01-05 04:21:51 +00:00			`; bitnum < 8: 25 or 41 cycles`
			`; bitnum >= 8: 30 or 46 cycles`
nice 2022-12-30 04:18:21 +00:00			`.macro bitmul16 arg1, arg2, result, bitnum`
wip fix 2023-01-05 03:52:56 +00:00			`.local zero`
hmm 2023-01-05 04:12:34 +00:00			`.local one`
			`.local next`
hmm 2022-12-29 11:37:51 +00:00
looks workable 2022-12-30 08:43:44 +00:00			`; does 16-bit adds`
comment tweaks 2023-01-05 05:09:45 +00:00			`; arg1 and arg2 are treated as unsigned`
			`; negative signed inputs must be flipped first`
looks workable 2022-12-30 08:43:44 +00:00
hmm 2023-01-05 04:12:34 +00:00			`; 7 cycles up to the branch`
nice 2022-12-30 04:18:21 +00:00
			`; check if arg1 has 0 or 1 bit in this place`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`; 5 cycles either way`
nice 2022-12-30 04:18:21 +00:00			`.if bitnum < 8`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`lda arg1 ; 3 cyc`
			`and #(1 << bitnum) ; 2 cyc`
nice 2022-12-30 04:18:21 +00:00			`.else`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`lda arg1 + 1 ; 3 cyc`
			`and #(1 << (bitnum - 8)) ; 2 cyc`
nice 2022-12-30 04:18:21 +00:00			`.endif`
hmm 2023-01-05 04:12:34 +00:00			`bne one ; 2 cyc`

hmm 2023-01-05 04:21:51 +00:00			`zero: ; 18 cyc, 23 cyc`
hmm 2023-01-05 04:12:34 +00:00			`lsr result + 3 ; 5 cyc`
			`jmp next ; 3 cyc`
hmm 2022-12-29 11:37:51 +00:00
hmm 2023-01-05 04:21:51 +00:00			`one: ; 32 cyc, 37 cyc`
hmm 2022-12-29 11:37:51 +00:00			`; 16-bit add on the top bits`
hmm 2023-01-05 04:12:34 +00:00			`clc ; 2 cyc`
woo 2022-12-31 02:21:31 +00:00			`lda result + 2 ; 3 cyc`
			`adc arg2 ; 3 cyc`
			`sta result + 2 ; 3 cyc`
			`lda result + 3 ; 3 cyc`
			`adc arg2 + 1 ; 3 cyc`
comment tweaks 2023-01-05 05:09:45 +00:00			`ror a ; 2 cyc - get a jump on the shift`
woo 2022-12-31 02:21:31 +00:00			`sta result + 3 ; 3 cyc`
save a few bytes by removing a dupe tail from two cases now 2023-01-05 04:33:42 +00:00			`next:`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`ror result + 2 ; 5 cyc`
			`ror result + 1 ; 5 cyc`
nice 2022-12-30 04:18:21 +00:00			`.if bitnum >= 8`
hmm 2023-01-05 04:21:51 +00:00			`; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte`
			`; when it's all uninitialized data`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`ror result ; 5 cyc`
nice 2022-12-30 04:18:21 +00:00			`.endif`
hmm 2023-01-05 04:21:51 +00:00
hmm 2023-01-05 04:12:34 +00:00
stuff 2022-12-29 05:08:16 +00:00			`.endmacro`

annotate cycle counts 2022-12-31 01:33:18 +00:00			`; 5 to 25 cycles`
looks workable 2022-12-30 08:43:44 +00:00			`.macro check_sign arg`
			`; Check sign bit and flip argument to postive,`
			`; keeping a count of sign bits in the X register.`
			`.local positive`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`lda arg + 1 ; 3 cyc`
			`bpl positive ; 2 cyc`
			`neg16 arg ; 18 cyc`
			`inx ; 2 cyc`
looks workable 2022-12-30 08:43:44 +00:00			`positive:`
			`.endmacro`
stuff 2022-12-29 05:08:16 +00:00
update cycle count for imul16 2023-01-05 04:37:16 +00:00			`; min 470 cycles`
			`; max 780 cycles`
looks workable 2022-12-30 08:43:44 +00:00			`.proc imul16`
			`arg1 = FR0 ; 16-bit arg (clobbered)`
			`arg2 = FR1 ; 16-bit arg (clobbered)`
			`result = FR2 ; 32-bit result`

annotate cycle counts 2022-12-31 01:33:18 +00:00			`ldx #0 ; 2 cyc`
looks workable 2022-12-30 08:43:44 +00:00			`; counts the number of sign bits in X`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`check_sign arg1 ; 5 to 25 cyc`
			`check_sign arg2 ; 5 to 25 cyc`
looks workable 2022-12-30 08:43:44 +00:00
nice 2022-12-30 04:18:21 +00:00			`; zero out the 32-bit temp's top 16 bits`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`lda #0 ; 2 cyc`
			`sta result + 2 ; 3 cyc`
			`sta result + 3 ; 3 cyc`
hmm 2022-12-29 11:37:51 +00:00			`; the bottom two bytes will get cleared by the shifts`
stuff 2022-12-29 05:08:16 +00:00
looks workable 2022-12-30 08:43:44 +00:00			`; unrolled loop for maximum speed, at the cost`
			`; of a larger routine`
update cycle count for imul16 2023-01-05 04:37:16 +00:00			`; 440 to 696 cycles`
stuff 2022-12-29 05:08:16 +00:00			`.repeat 16, bitnum`
update cycle count for imul16 2023-01-05 04:37:16 +00:00			`; bitnum < 8: 25 or 41 cycles`
			`; bitnum >= 8: 30 or 46 cycles`
looks workable 2022-12-30 08:43:44 +00:00			`bitmul16 arg1, arg2, result, bitnum`
stuff 2022-12-29 05:08:16 +00:00			`.endrepeat`
nice 2022-12-30 04:18:21 +00:00
looks workable 2022-12-30 08:43:44 +00:00			`; In case of mixed input signs, return a negative result.`
annotate cycle counts 2022-12-31 01:33:18 +00:00			`cpx #1 ; 2 cyc`
			`bne positive_result ; 2 cyc`
			`neg32 result ; 34 cyc`
looks workable 2022-12-30 08:43:44 +00:00			`positive_result:`

annotate cycle counts 2022-12-31 01:33:18 +00:00			`rts ; 6 cyc`
stuff 2022-12-29 05:08:16 +00:00			`.endproc`

Added two version of 16-bit rounding round16_incdec uses inc and dec round16_addsub uses adc and sbc the incdec version is the same when no rounding is needed but saves about 8 cycles on the rounding cases, for an average savings of 4.5 cycles for randomly distributed inputs untested so far 2023-01-05 17:06:07 +00:00			`.macro round16_incdec arg`
			`; Round top 16 bits of 32-bit fixed-point number in-place`
			`.local zero`
			`.local one`
			`.local positive`
			`.local negative`
			`.local neg2`
			`.local next`

			`; no round - 5 cycles`
			`; round pos, no carry - 17`
			`; round pos, carry - 22`
			`; round neg, no carry - 23`
			`; round neg, carry - 28`
			`; average = 5 / 2 + (17 + 22 + 23 + 28) / 8`
			`; = 5 / 2 + 90 / 8`
			`; = 2.5 + 11.25 = 13.75 cycles average on evenly distributed input`

			`lda arg + 1 ; 3 cyc`
			`bpl zero ; 2 cyc`

			`one:`
			`; check sign bit`
			`lda arg + 3 ; 3 cyc`
			`bpl positive ; 2 cyc`

			`negative:`
			`lda arg + 2 ; 3 cyc`
			`beq neg2 ; 2 cyc`

			`dec arg + 2 ; 5 cyc`
			`jmp next ; 3 cyc`

			`neg2:`
			`dec arg + 2 ; 5 cyc`
			`dec arg + 3 ; 5 cyc`
			`jmp next ; 3 cyc`

			`positive:`
			`inc arg + 2 ; 5 cyc`
			`beq next ; 2 cyc`
			`inc arg + 3 ; 5 cyc`

			`zero:`
			`next:`

			`.endmacro`

one last round sketch combining copy and round 2023-01-05 19:17:13 +00:00
Added two version of 16-bit rounding round16_incdec uses inc and dec round16_addsub uses adc and sbc the incdec version is the same when no rounding is needed but saves about 8 cycles on the rounding cases, for an average savings of 4.5 cycles for randomly distributed inputs untested so far 2023-01-05 17:06:07 +00:00
stuff 2022-12-29 05:08:16 +00:00			`.proc iter`
update round & sketch out the iter cycle count 2023-01-05 19:32:15 +00:00			`; still working on the fixed-point`
			`; should we just use 16-bit adds?`
			`; does that require extra rounding?`
			`; is the integer precision right?`

			`; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)`
stuff 2022-12-29 05:08:16 +00:00			`; zx = 0`
			`; zy = 0`
			`; zx_2 = 0`
maybe 2022-12-30 08:55:48 +00:00			`; zy_2 = 0`
			`; zx_zy = 0`
stuff 2022-12-29 05:08:16 +00:00
			`loop:`
whoops missed a few cycs 2023-01-05 19:55:41 +00:00			`; 1652 - 2651 cyc`
update round & sketch out the iter cycle count 2023-01-05 19:32:15 +00:00
			`; iters++ = 2 cyc`
stuff 2022-12-29 05:08:16 +00:00
update round & sketch out the iter cycle count 2023-01-05 19:32:15 +00:00			`; 4.12: (-8 .. +7.9)`
			`; zx = zx_2 + zy_2 + cx = 3 * 20 = 60 cyc`
			`; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc`
stuff 2022-12-29 05:08:16 +00:00
show range 2023-01-05 19:58:32 +00:00			`; 8.24: (-128 .. +127.9)`
update round & sketch out the iter cycle count 2023-01-05 19:32:15 +00:00			`; zx_2 = zx * zx = 470 - 780 cyc`
			`; zy_2 = zy * zy = 470 - 780 cyc`
			`; zx_zy = zx * zy = 470 - 780 cyc`
			`; dist = zx_2 + zy_2 = 38 cyc`
			`; if dist >= 4 break, else continue iterating = 7 cyc`
stuff 2022-12-29 05:08:16 +00:00
whoops missed a few cycs 2023-01-05 19:55:41 +00:00			`; shift and round zx_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc`
			`; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc`
			`; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc`
maybe 2022-12-30 08:55:48 +00:00
			`; if may be in the lake, look for looping output with a small buffer`
			`; as an optimization vs running to max iters`

stuff 2022-12-29 05:08:16 +00:00			`.endproc`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00
			`.proc start`
looks workable 2022-12-30 08:43:44 +00:00
hmm 2023-01-05 04:12:34 +00:00			`looplong:`
looks workable 2022-12-30 08:43:44 +00:00			`; FR0 = 5`
			`; FR1 = -3`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00			`lda #5`
looks workable 2022-12-30 08:43:44 +00:00			`sta FR0`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00			`lda #0`
			`sta FR0 + 1`
looks workable 2022-12-30 08:43:44 +00:00			`lda #$fd`
			`sta FR1`
			`lda #$ff`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00			`sta FR1 + 1`

			`jsr imul16`
looks workable 2022-12-30 08:43:44 +00:00			`; should have 32-bit -15 in FR2`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00
hmm 2023-01-05 04:12:34 +00:00			`loop:`
works for 3 * 5 = 15 2022-12-30 04:32:58 +00:00			`jmp loop`
			`.endproc`