2022-12-29 05:08:16 +00:00
|
|
|
; FP registers in zero page.
; Five symbols spaced six bytes apart; in this file FR0, FR1 and FR2 are
; used as plain integer scratch space by imul16.
; NOTE(review): the addresses look like the Atari 8-bit OS floating-point
; work registers -- confirm the target platform.
FR0 = $D4
FRE = $DA
FR1 = $E0
FR2 = $E6
FRX = $EC

.code

; Entry point, visible to the linker.
.export start
|
|
|
|
|
2022-12-30 08:43:44 +00:00
|
|
|
; neg bytes, arg
;
; Negate in place the integer stored low byte first in the `bytes` bytes
; starting at `arg`. Each byte is replaced by 0 - byte, with the borrow
; rippling upward from the lowest byte (two's-complement negation).
;
; Clobbers: A and the flags.
; Cost:     2 + 8 * bytes cycles (per-instruction counts below are the
;           zero-page timings).
.macro neg bytes, arg
    sec                         ; 2 cyc - no borrow going into the low byte
    .repeat bytes, idx          ; 8 cyc for every byte
        lda #0                  ; 2 cyc
        sbc arg + idx           ; 3 cyc
        sta arg + idx           ; 3 cyc
    .endrepeat
.endmacro
|
|
|
|
|
|
|
|
; neg16 arg
;
; Negate the 16-bit value at arg / arg + 1 in place.
; Thin wrapper around `neg` with a width of two bytes.
; Clobbers A and the flags; 2 + 8 * 2 = 18 cycles.
.macro neg16 arg
    neg 2, arg
.endmacro
|
|
|
|
|
|
|
|
; neg32 arg
;
; Negate the 32-bit value at arg .. arg + 3 in place.
; Thin wrapper around `neg` with a width of four bytes.
; Clobbers A and the flags; 2 + 8 * 4 = 34 cycles.
.macro neg32 arg
    neg 4, arg
.endmacro
|
|
|
|
|
2022-12-31 02:25:43 +00:00
|
|
|
; bitmul16 arg1, arg2, result, bitnum
;
; Inner step of imul16: handles bit `bitnum` (0..15) of arg1.
;
;   * If the bit is set, the 16-bit arg2 is added to the top 16 bits of
;     the 32-bit `result` (result + 2 / result + 3).
;   * Either way `result` is then shifted right by one bit, with the carry
;     out of the addition (or 0 when nothing was added) entering bit 31.
;
; arg1 and arg2 are treated as unsigned; negative signed inputs must be
; flipped to positive first.
;
; Clobbers: A and the flags.
; Cost, summing the per-instruction annotations below (7 up to and
; including the branch, one extra cycle when the branch is taken):
;   bit clear: 7 + 18 = 25 cycles (bitnum < 8), 7 + 23 = 30 (bitnum >= 8)
;   bit set:   7 + 32 = 39 cycles (bitnum < 8), 7 + 37 = 44 (bitnum >= 8)
.macro bitmul16 arg1, arg2, result, bitnum
    .local bit_clear
    .local bit_set
    .local shift_rest

    ; Load the byte of arg1 that holds this bit and mask everything else
    ; off. 5 cycles whichever byte is used.
    .if bitnum >= 8
        lda arg1 + 1                    ; 3 cyc
        and #(1 << (bitnum - 8))        ; 2 cyc
    .else
        lda arg1                        ; 3 cyc
        and #(1 << bitnum)              ; 2 cyc
    .endif
    bne bit_set                         ; 2 cyc

bit_clear:                              ; 18 cyc, 23 cyc
    ; Nothing to add: start the 32-bit shift with a zero entering the top.
    lsr result + 3                      ; 5 cyc
    jmp shift_rest                      ; 3 cyc

bit_set:                                ; 32 cyc, 37 cyc
    ; 16-bit add of arg2 into the top half of the result.
    clc                                 ; 2 cyc
    lda result + 2                      ; 3 cyc
    adc arg2                            ; 3 cyc
    sta result + 2                      ; 3 cyc
    lda result + 3                      ; 3 cyc
    adc arg2 + 1                        ; 3 cyc
    ; Shift the top byte while it is still in A, so the carry out of the
    ; addition lands in bit 7 without a separate read-modify-write.
    ror a                               ; 2 cyc
    sta result + 3                      ; 3 cyc

shift_rest:
    ; Carry holds the bit that fell out of result + 3; pass it down.
    ror result + 2                      ; 5 cyc
    ror result + 1                      ; 5 cyc
    .if bitnum >= 8
        ; The lowest byte is only shifted for the last eight bits. imul16
        ; clears just result + 2 / result + 3, so what leaves result + 1
        ; during the first eight steps is leftover data, not product bits.
        ; Skipping the shift there saves 5 cycles * 8 bits = 40 cycles.
        ror result                      ; 5 cyc
    .endif
.endmacro
|
|
|
|
|
2022-12-31 01:33:18 +00:00
|
|
|
; check_sign arg
;
; Make the 16-bit value at arg / arg + 1 non-negative and count the flip:
; when bit 7 of arg + 1 is set, the value is negated in place with neg16
; and X is incremented. X is only ever incremented here, so the caller has
; to zero it before the first use to get a count of negative inputs.
;
; Clobbers: A and the flags, plus X and arg as described.
; Cost:     5 cycles when already positive, 25 when a flip is needed.
.macro check_sign arg
    .local non_negative

    lda arg + 1                 ; 3 cyc - sign lives in the high byte
    bpl non_negative            ; 2 cyc
    neg16 arg                   ; 18 cyc
    inx                         ; 2 cyc
non_negative:
.endmacro
|
2022-12-29 05:08:16 +00:00
|
|
|
|
2023-01-05 04:37:16 +00:00
|
|
|
; imul16
;
; Signed 16-bit * 16-bit multiply producing a 32-bit result.
;
;   In:       FR0, FR1 - 16-bit operands, low byte first
;   Out:      FR2 .. FR2 + 3 - 32-bit product, low byte first
;   Clobbers: A, X, flags; a negative operand is negated in place, so
;             FR0 / FR1 do not survive the call
;
; Method: count the negative inputs in X while flipping them positive, do
; an unsigned multiply, then negate the product when exactly one input was
; negative.
;
; Author's estimate for the whole routine: min 470 cycles, max 780 cycles.
.proc imul16
    arg1 = FR0                  ; 16-bit arg (clobbered)
    arg2 = FR1                  ; 16-bit arg (clobbered)
    result = FR2                ; 32-bit result

    ; X counts how many of the inputs were negative.
    ldx #0                      ; 2 cyc
    check_sign arg1             ; 5 to 25 cyc
    check_sign arg2             ; 5 to 25 cyc

    ; Only the top 16 bits of the product need clearing; the bottom two
    ; bytes are completely overwritten by the shifts below.
    lda #0                      ; 2 cyc
    sta result + 2              ; 3 cyc
    sta result + 3              ; 3 cyc

    ; Fully unrolled: sixteen copies of the step, trading code size for
    ; speed. At least 8 * 25 + 8 * 30 = 440 cycles.
    .repeat 16, bitnum
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; Mixed input signs (exactly one flip) mean a negative product.
    cpx #1                      ; 2 cyc
    bne done                    ; 2 cyc
    neg32 result                ; 34 cyc

done:
    rts                         ; 6 cyc
.endproc
|
|
|
|
|
2023-01-05 17:06:07 +00:00
|
|
|
; round16_incdec arg
;
; Round the top 16 bits of the 32-bit two's-complement fixed-point number
; at arg .. arg + 3 (low byte first) in place. The high word
; (arg + 2 / arg + 3) ends up as the nearest 16-bit value, with exact
; halves rounded away from zero. The low word is not modified.
;
; Because the high word of a two's-complement value is already rounded
; toward minus infinity, rounding only ever needs an increment:
;   low word <  $8000            -> keep the high word
;   low word >  $8000            -> high word + 1
;   low word == $8000, value >= 0 -> high word + 1
;   low word == $8000, value <  0 -> keep the high word
;
; Fixes relative to the previous version:
;   * the carry test after `inc arg + 2` was inverted (beq instead of
;     bne), so the high byte was bumped on every round-up except the one
;     that actually wrapped;
;   * negative values were decremented, which moves away from the nearest
;     value (e.g. $FFFF_C000 = -0.25 became -2 instead of 0).
;
; Clobbers: A and the flags.
; Cost (zero-page arg, branches 2 cycles not taken / 3 taken):
;   no rounding                  6 cycles
;   value >= 0, no carry        19
;   value >= 0, carry           23
;   value <  0, exact half      21
;   value <  0, no carry        28
;   value <  0, carry           32
.macro round16_incdec arg
    .local negative
    .local round_up
    .local next

    lda arg + 1                 ; 3 cyc - bit 15 set: fraction >= 1/2
    bpl next                    ; 2 cyc - below 1/2, nothing to do

    ; check sign bit
    lda arg + 3                 ; 3 cyc
    bpl round_up                ; 2 cyc

negative:
    ; Below zero an exact half must stay put: the truncated high word is
    ; already the neighbour further from zero.
    lda arg + 1                 ; 3 cyc
    asl a                       ; 2 cyc - discard bit 15 (known to be set)
    ora arg                     ; 3 cyc - zero only if low word == $8000
    beq next                    ; 2 cyc

round_up:
    inc arg + 2                 ; 5 cyc
    bne next                    ; 2 cyc - no wrap, high byte is unchanged
    inc arg + 3                 ; 5 cyc - propagate the carry

next:
.endmacro
|
|
|
|
|
|
|
|
; round16_addsub arg
;
; Round the top 16 bits of the 32-bit two's-complement fixed-point number
; at arg .. arg + 3 (low byte first) in place, using an adc chain for the
; 16-bit increment. The high word (arg + 2 / arg + 3) ends up as the
; nearest 16-bit value, with exact halves rounded away from zero. The low
; word is not modified.
;
; Because the high word of a two's-complement value is already rounded
; toward minus infinity, rounding only ever needs an increment:
;   low word <  $8000            -> keep the high word
;   low word >  $8000            -> high word + 1
;   low word == $8000, value >= 0 -> high word + 1
;   low word == $8000, value <  0 -> keep the high word
;
; Fixes relative to the previous version:
;   * the negative path ended with `lda arg + 3` where a store was needed,
;     so the high byte was never written back;
;   * negative values were decremented, which moves away from the nearest
;     value (e.g. $FFFF_C000 = -0.25 became -2 instead of 0).
;
; Clobbers: A and the flags. The adc chain expects decimal mode to be off.
; Cost (zero-page arg, branches 2 cycles not taken / 3 taken):
;   no rounding                  6 cycles
;   value >= 0                  29
;   value <  0, exact half      21
;   value <  0, otherwise       38
.macro round16_addsub arg
    .local negative
    .local round_up
    .local next

    lda arg + 1                 ; 3 cyc - bit 15 set: fraction >= 1/2
    bpl next                    ; 2 cyc - below 1/2, nothing to do

    ; check sign bit
    lda arg + 3                 ; 3 cyc
    bpl round_up                ; 2 cyc

negative:
    ; Below zero an exact half must stay put: the truncated high word is
    ; already the neighbour further from zero.
    lda arg + 1                 ; 3 cyc
    asl a                       ; 2 cyc - discard bit 15 (known to be set)
    ora arg                     ; 3 cyc - zero only if low word == $8000
    beq next                    ; 2 cyc

round_up:
    clc                         ; 2 cyc
    lda arg + 2                 ; 3 cyc
    adc #1                      ; 2 cyc
    sta arg + 2                 ; 3 cyc
    lda arg + 3                 ; 3 cyc
    adc #0                      ; 2 cyc - just the carry from the low byte
    sta arg + 3                 ; 3 cyc

next:
.endmacro
|
|
|
|
|
|
|
|
|
2022-12-29 05:08:16 +00:00
|
|
|
; iter
;
; Placeholder for the per-point iteration. It contains no instructions
; yet (there is not even an rts); the comments below are the plan,
; written as pseudo-code.
.proc iter
    ; Inputs: cx and cy, which should be pre-scaled to 6.26 fixed point.
    ;
    ; Initial state:
    ;   zx    = 0
    ;   zy    = 0
    ;   zx_2  = 0
    ;   zy_2  = 0
    ;   zx_zy = 0

    ; still working on the fixed-point

loop:
    ; iters++

    ; In 6.26:
    ;   zx = zx_2 + zy_2 + cx
    ;   zy = zx_zy + zx_zy + cy
    ; then round both to 6.10.
    ;
    ; NOTE(review): if this is meant to be the usual z = z^2 + c step, the
    ; first line should read zx_2 - zy_2 + cx, and zx_zy = zx * zy still
    ; has to be computed in the list below -- confirm.

    ; In 12.20:
    ;   zx_2 = zx * zx
    ;   zy_2 = zy * zy
    ;   dist = zx_2 + zy_2
    ;   if dist >= 4 break, else continue iterating

    ; round zx_2, zy_2, dist to 6.26

    ; If the point may be in the lake, look for looping output with a
    ; small buffer, as an optimization vs running to max iters.

.endproc
|
2022-12-30 04:32:58 +00:00
|
|
|
|
|
|
|
; start
;
; Exported entry point. Currently a smoke test for imul16: loads 5 and -3
; into FR0 and FR1, calls the multiply, then spins forever so the result
; can be inspected.
.proc start
    FACTOR_A = 5                ; goes into FR0
    FACTOR_B = $fffd            ; -3 as a 16-bit value, goes into FR1

looplong:                       ; not referenced anywhere in the visible source
    lda #<FACTOR_A
    sta FR0
    lda #>FACTOR_A
    sta FR0 + 1

    lda #<FACTOR_B
    sta FR1
    lda #>FACTOR_B
    sta FR1 + 1

    jsr imul16
    ; should have 32-bit -15 in FR2

loop:
    jmp loop                    ; park here
.endproc
|