; mandel-6502/mandel.s
; FP registers in zero page.
; Each symbol is the address of a register's lowest byte; the addresses
; below are spaced 6 bytes apart.
FR0     = $D4
FRE     = $DA
FR1     = $E0
FR2     = $E6
FRX     = $EC

; `start` is the symbol made visible to other modules / the linker.
.export start

.code
2022-12-30 08:43:44 +00:00
; neg bytes, arg
; Negate, in place, the `bytes`-byte value stored at `arg` by computing
; 0 - value. The subtraction starts at the lowest address so the borrow
; ripples upward into the following bytes.
;   bytes - number of bytes to process (assemble-time constant)
;   arg   - address of the lowest byte of the value
; Clobbers A and the processor flags.
; Cost: 2 + 8 * bytes cycles (per the per-instruction figures below).
.macro neg bytes, arg
    sec                         ; 2 cyc - start with no borrow
    .repeat bytes, idx          ; 8 cyc per byte
        lda #0                  ; 2 cyc
        sbc arg + idx           ; 3 cyc - A = 0 - byte - borrow
        sta arg + idx           ; 3 cyc
    .endrepeat
.endmacro
; neg16 arg
; Negate the 2-byte value at `arg` in place (0 - value, low byte first).
; Emits the same instruction sequence as `neg 2, arg`.
; Clobbers A and the processor flags.
; Cost: 18 cycles (2 + 2 * 8).
.macro neg16 arg
    sec                         ; start with no borrow
    .repeat 2, idx
        lda #0
        sbc arg + idx           ; A = 0 - byte - borrow
        sta arg + idx
    .endrepeat
.endmacro
; neg32 arg
; Negate the 4-byte value at `arg` in place (0 - value, low byte first).
; Emits the same instruction sequence as `neg 4, arg`.
; Clobbers A and the processor flags.
; Cost: 34 cycles (2 + 4 * 8).
.macro neg32 arg
    sec                         ; start with no borrow
    .repeat 4, idx
        lda #0
        sbc arg + idx           ; A = 0 - byte - borrow
        sta arg + idx
    .endrepeat
.endmacro
2022-12-31 02:25:43 +00:00
; bitmul16 arg1, arg2, result, bitnum
; Inner loop body for imul16: one step of a shift-and-add multiply.
;
; Tests bit `bitnum` (0..15) of the 16-bit value at `arg1`. If the bit is
; set, the 16-bit value at `arg2` is added into the top two bytes of
; `result` (result + 2, result + 3). In both cases the accumulated value is
; then shifted right by one bit, with the carry out of the add becoming the
; new top bit.
;
;   arg1   - address of the 16-bit multiplier (low byte first)
;   arg2   - address of the 16-bit multiplicand (low byte first)
;   result - address of the 4-byte accumulator (low byte first)
;   bitnum - which bit of arg1 this step handles (assemble-time constant)
;
; arg1 and arg2 are treated as unsigned; negative signed inputs must be
; flipped first.
; Clobbers A and the processor flags.
;
; Cycle estimates (original annotations):
;   bitnum < 8:  25 or 41 cycles
;   bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next

    ; 7 cycles up to the branch
    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                    ; 3 cyc
        and #(1 << bitnum)          ; 2 cyc
    .else
        lda arg1 + 1                ; 3 cyc
        and #(1 << (bitnum - 8))    ; 2 cyc
    .endif
    bne one                         ; 2 cyc

zero:                               ; 18 cyc, 23 cyc
    ; nothing to add: just start the right shift at the top byte,
    ; feeding a 0 into the top bit
    lsr result + 3                  ; 5 cyc
    jmp next                        ; 3 cyc

one:                                ; 32 cyc, 37 cyc
    ; 16-bit add on the top bits
    clc                             ; 2 cyc
    lda result + 2                  ; 3 cyc
    adc arg2                        ; 3 cyc
    sta result + 2                  ; 3 cyc
    lda result + 3                  ; 3 cyc
    adc arg2 + 1                    ; 3 cyc
    ; rotate the add's carry into the top bit while the byte is still in A
    ror a                           ; 2 cyc - get a jump on the shift
    sta result + 3                  ; 3 cyc

next:
    ; continue the right shift through the lower bytes; the carry holds
    ; the bit that fell out of the byte above
    ror result + 2                  ; 5 cyc
    ror result + 1                  ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this
        ; byte when it's all uninitialized data
        ror result                  ; 5 cyc
    .endif
.endmacro
2022-12-31 01:33:18 +00:00
; check_sign arg
; Check the sign bit of the 16-bit value at `arg` (bit 7 of arg + 1).
; If it is set, negate the value in place and increment X, so that X keeps
; a count of how many checked arguments were negative. If it is clear, the
; value and X are left untouched.
;   arg - address of the 16-bit value (low byte first)
; Clobbers A and the processor flags; X is incremented as described.
; Cost: 5 to 25 cycles (original estimate).
.macro check_sign arg
    .local positive

    lda arg + 1                 ; 3 cyc - high byte carries the sign
    bpl positive                ; 2 cyc
    neg16 arg                   ; 18 cyc - flip argument to positive
    inx                         ; 2 cyc - one more negative input seen
positive:
.endmacro
2022-12-29 05:08:16 +00:00
2023-01-05 04:37:16 +00:00
; imul16
; Signed 16-bit x 16-bit multiply giving a signed 32-bit product.
;
; In:   FR0, FR0 + 1  - first 16-bit argument (low byte first)
;       FR1, FR1 + 1  - second 16-bit argument (low byte first)
; Out:  FR2 .. FR2 + 3 - 32-bit result (low byte first)
; Clobbers: A, X, flags. An argument that was negative on entry is left
;       negated in place (both arguments are documented as clobbered).
;
; Method: both arguments are made non-negative while X counts how many were
; negative, an unsigned shift-and-add multiply is run over all 16 bits of
; arg1, and the product is negated if exactly one input was negative.
;
; min 470 cycles
; max 780 cycles
.proc imul16
    arg1 = FR0                  ; 16-bit arg (clobbered)
    arg2 = FR1                  ; 16-bit arg (clobbered)
    result = FR2                ; 32-bit result

    ldx #0                      ; 2 cyc
    ; counts the number of sign bits in X
    check_sign arg1             ; 5 to 25 cyc
    check_sign arg2             ; 5 to 25 cyc

    ; zero out the 32-bit temp's top 16 bits
    lda #0                      ; 2 cyc
    sta result + 2              ; 3 cyc
    sta result + 3              ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts

    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    ; 440 to 696 cycles
    .repeat 16, bitnum
        ; bitnum < 8: 25 or 41 cycles
        ; bitnum >= 8: 30 or 46 cycles
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; In case of mixed input signs, return a negative result.
    cpx #1                      ; 2 cyc - exactly one negative input?
    bne positive_result         ; 2 cyc
    neg32 result                ; 34 cyc
positive_result:
    rts                         ; 6 cyc
.endproc
; round16_incdec arg
; Round the top 16 bits (arg + 2, arg + 3) of the 32-bit two's-complement
; fixed-point number at `arg` in place, according to the highest bit of the
; low 16 bits (bit 7 of arg + 1).
;
;   arg - address of the 32-bit value (low byte first)
;
; In two's complement the top 16 bits are already floor(value / 65536) for
; both positive and negative values, so rounding to nearest means adding
; one to the top word whenever the low word is >= $8000, whatever the sign.
; Exact halves therefore round toward +infinity. The low 16 bits are left
; unchanged. Overflow of the top word ($7FFF -> $8000) is not detected.
;
; Fixes relative to the previous version:
;   * the carry into arg + 3 was taken on the wrong condition (`beq`
;     instead of `bne` after `inc arg + 2`), so the high byte was bumped
;     when the low byte did NOT wrap and skipped when it did;
;   * negative values were decremented, which moved them a whole unit away
;     from the correct result (e.g. $FFFE8000 = -1.5 became $FFFD = -3).
; The macro name is kept so existing callers continue to assemble.
;
; Clobbers A and the processor flags.
.macro round16_incdec arg
    .local next

    lda arg + 1                 ; bit 7 = most significant discarded bit
    bpl next                    ; clear: fraction < 1/2, nothing to do

    ; 16-bit increment of the top word
    inc arg + 2
    bne next                    ; low byte did not wrap: no carry
    inc arg + 3                 ; low byte wrapped to 0: carry into high byte
next:
.endmacro
2022-12-29 05:08:16 +00:00
; iter
; Placeholder for the per-point iteration loop. It currently contains no
; instructions, only the `loop` label and the design notes / cycle budget
; below.
.proc iter
    ; still working on the fixed-point
    ; should we just use 16-bit adds?
    ; does that require extra rounding?
    ; is the integer precision right?

    ; (cx and cy should be pre-scaled to 4.12 fixed point - -8..+7.9)

    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

loop:
    ; 1652 - 2651 cyc

    ; iters++ = 2 cyc

    ; 4.12: (-8 .. +7.9)
    ; zx = zx_2 - zy_2 + cx = 3 * 20 = 60 cyc
    ; zy = zx_zy + zx_zy + cy = 3 * 20 = 60 cyc

    ; 8.24: (-128 .. +127.9)
    ; zx_2 = zx * zx = 470 - 780 cyc
    ; zy_2 = zy * zy = 470 - 780 cyc
    ; zx_zy = zx * zy = 470 - 780 cyc
    ; dist = zx_2 + zy_2 = 38 cyc
    ; if dist >= 4 break, else continue iterating = 7 cyc

    ; shift and round zx_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
    ; shift and round zy_2 to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc
    ; shift and round zx_zy to 4.12 = (20 + 5) - (20 + 28) = 25 - 48 cyc

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
.endproc
2022-12-30 04:32:58 +00:00
; start
; Exported entry point. Currently a smoke test for imul16: loads 5 and -3
; into the argument registers, calls imul16, then spins forever so the
; result can be inspected. Never returns.
.proc start

looplong:
    ; FR0 = 5
    ; FR1 = -3
    lda #5
    sta FR0                     ; low byte of 5
    lda #0
    sta FR0 + 1                 ; high byte of 5
    lda #$fd
    sta FR1                     ; low byte of -3 ($fffd)
    lda #$ff
    sta FR1 + 1                 ; high byte of -3
    jsr imul16
    ; should have 32-bit -15 in FR2

loop:
    jmp loop                    ; halt here
.endproc