; mandel-6502/mandel.s
; 2022-12-30 01:05:52 -08:00
;
; 246 lines
; 4.1 KiB
; ArmAsm

; FP registers in zero page
; Five zero-page slots, each 6 bytes after the previous one. The code below
; uses them as scratch storage for multi-byte little-endian integers.
; NOTE(review): the names and addresses look like the Atari OS floating-point
; package registers -- confirm before relying on any OS behaviour.
FR0 = $d4 ; imul16: first 16-bit argument
FRE = $da ; not referenced in this part of the file
FR1 = $e0 ; imul16: second 16-bit argument
FR2 = $e6 ; imul16: 32-bit result
FRX = $ec ; not referenced in this part of the file
; Everything that follows is assembled into the code segment; `start` is the
; only symbol exported from this part of the file.
.code
.export start
; Sign-extend the 16-bit little-endian value at arg to 32 bits in place:
; arg+2 and arg+3 are filled with $00 when the value is non-negative and
; with $ff when it is negative.
; Clobbers: A, flags.
.macro sext16to32 arg
    .local plus
    lda arg + 1
    asl                 ; sign bit -> carry
    lda #$00            ; non-negative fill; lda leaves the carry untouched
    bcc plus            ; carry clear -> value was non-negative
    lda #$ff            ; negative fill
plus:
    sta arg + 2
    sta arg + 3
.endmacro
; Copy `bytes` bytes from arg1 (source) to arg2 (destination), lowest
; address first.
; Clobbers: A, N/Z flags.
.macro copy bytes, arg1, arg2
    ; The repeat count must follow the `bytes` parameter; a fixed count of 2
    ; would make copy32 move only half of its operand.
    .repeat bytes, byte
        lda arg1 + byte
        sta arg2 + byte
    .endrepeat
.endmacro
; Copy a 16-bit value from arg1 to arg2.
.macro copy16 arg1, arg2
    copy 2, arg1, arg2
.endmacro
; Copy a 32-bit value from arg1 to arg2.
.macro copy32 arg1, arg2
    copy 4, arg1, arg2
.endmacro
; Negate a little-endian value of `bytes` bytes in place by computing
; 0 - arg one byte at a time, lowest byte first, with the borrow rippling up.
; Clobbers: A, flags.
; Cost: 2 + 8 * bytes cycles
.macro neg bytes, arg
    sec                     ; 2 cyc - start with no borrow
    .repeat bytes, idx      ; 8 cyc for every byte
        lda #0              ; 2 cyc
        sbc arg + idx       ; 3 cyc
        sta arg + idx       ; 3 cyc
    .endrepeat
.endmacro
; 16-bit negate: 18 cycles
.macro neg16 arg
    neg 2, arg
.endmacro
; 32-bit negate: 34 cycles
.macro neg32 arg
    neg 4, arg
.endmacro
; Multi-byte add: arg1 = arg1 + arg2, over `bytes` little-endian bytes.
; The carry is cleared once up front and then ripples from each byte into
; the next; the carry out of the top byte is left in the carry flag.
; Clobbers: A, flags.
.macro add bytes, arg1, arg2
    clc
    .repeat bytes, idx
        lda arg1 + idx
        adc arg2 + idx
        sta arg1 + idx
    .endrepeat
.endmacro
; 16-bit add: arg1 += arg2
.macro add16 arg1, arg2
    add 2, arg1, arg2
.endmacro
; 32-bit add: arg1 += arg2
.macro add32 arg1, arg2
    add 4, arg1, arg2
.endmacro
; Shift a little-endian value of `bytes` bytes left by one bit, in place.
; The bit shifted out of the top byte ends up in the carry flag.
; Clobbers: flags (A is not used).
.macro shl bytes, arg
    .repeat bytes, idx
        .if idx = 0
            asl arg             ; lowest byte: 0 enters bit 0, bit 7 -> carry
        .else
            rol arg + idx       ; carry enters bit 0 of each higher byte
        .endif
    .endrepeat
.endmacro
; 16-bit shift left by one
.macro shl16 arg
    shl 2, arg
.endmacro
; 24-bit shift left by one
.macro shl24 arg
    shl 3, arg
.endmacro
; 32-bit shift left by one
.macro shl32 arg
    shl 4, arg
.endmacro
; Logical (unsigned) shift right by one bit of a little-endian value of
; `bytes` bytes, in place. A zero enters the top bit; the bit shifted out of
; the lowest byte ends up in the carry flag.
; Clobbers: flags (A is not used).
.macro shr bytes, arg
    ; A right shift has to start at the most significant byte (the highest
    ; address) so that the bit falling out of each byte is carried into the
    ; top of the next lower byte.
    lsr arg + (bytes - 1)
    .repeat bytes - 1, byte
        ror arg + (bytes - 2 - byte)
    .endrepeat
.endmacro
; 16-bit logical shift right by one
.macro shr16 arg
    shr 2, arg
.endmacro
; 24-bit logical shift right by one
.macro shr24 arg
    shr 3, arg
.endmacro
; 32-bit logical shift right by one
.macro shr32 arg
    shr 4, arg
.endmacro
; One step of an unrolled shift-and-add multiply.
; Tests bit `bitnum` of the 16-bit arg1. When that bit is set, the 16-bit arg2
; is added into the top two bytes of the 32-bit result. The result is then
; rotated right by one bit, with the carry from the add (or 0 when no add
; happened) entering at the top.
; arg1 and arg2 must both be zero or positive.
; Clobbers: A, flags.
.macro bitmul16 arg1, arg2, result, bitnum
    .local shift
    clc                             ; stays clear unless the add below carries
    ; Select the byte of arg1 that holds this bit and mask the bit within it.
    lda arg1 + (bitnum / 8)
    and #(1 << (bitnum & 7))
    beq shift
    ; 16-bit add on the top bits
    lda result + 2
    adc arg2
    sta result + 2
    lda result + 3
    adc arg2 + 1
    sta result + 3
shift:
    ; Rotate the result down one bit, pulling the saved carry in at the top.
    ror result + 3
    ror result + 2
    ror result + 1
    .if bitnum > 7
        ; For the first 8 steps the lowest byte would only receive
        ; uninitialized data, so skipping its rotate there saves
        ; 5 cycles * 8 bits = 40 cycles in total.
        ror result
    .endif
.endmacro
; If the 16-bit value at arg has its sign bit set, negate it in place and
; increment X. Used repeatedly, X ends up counting how many arguments were
; negative.
; Clobbers: A, flags; X is incremented only for a negative argument.
.macro check_sign arg
    .local nonneg
    lda arg + 1         ; the high byte carries the sign in bit 7
    bpl nonneg
    neg 2, arg
    inx
nonneg:
.endmacro
; imul16: signed 16 x 16 -> 32-bit multiply.
;   In:  FR0 = arg1, FR1 = arg2 (16-bit, little-endian). A negative argument
;        is negated in place, so both inputs are clobbered.
;   Out: FR2 = 32-bit product
;   Clobbers: A, X, flags
.proc imul16
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result

    ; Only the top 16 bits of the result need clearing; the low two bytes
    ; are overwritten by the shifts in the multiply steps.
    lda #0
    sta result + 2
    sta result + 3

    ; Make both arguments non-negative, counting the negative ones in X.
    ldx #0
    check_sign arg1
    check_sign arg2

    ; Fully unrolled: one multiply step per bit of arg1. Fast, but large.
    .repeat 16, bitnum
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; Exactly one negative input (X = 1) means the product is negative.
    cpx #1
    beq negate
    rts
negate:
    neg 4, result
    rts
.endproc
; iter: placeholder for the per-point Mandelbrot iteration.
; The body currently holds only the planned algorithm as comments, so this
; proc emits no instructions and has no rts; execution entering here would
; run straight into whatever is assembled next.
.proc iter
; Inputs: cx and cy should be pre-scaled to 6.26 fixed point.
; Initial state:
; zx = 0
; zy = 0
; zx_2 = 0
; zy_2 = 0
; zx_zy = 0
; (the fixed-point formats below are still being worked out)
loop:
; iters++
; In 6.26:
; zx = zx_2 - zy_2 + cx
;   NOTE(review): this line previously read "zx_2 + zy_2 + cx"; the real part
;   of z^2 + c is zx^2 - zy^2 + cx, so the sign was corrected here.
; zy = zx_zy + zx_zy + cy
; Round zx and zy to 6.10.
; In 12.20 (the product of two 6.10 values):
; zx_2 = zx * zx
; zy_2 = zy * zy
; dist = zx_2 + zy_2
; If dist >= 4, break; otherwise continue iterating.
; Round zx_2, zy_2 and dist to 6.26.
; If the point may be in the lake, look for looping output with a small
; buffer, as an optimization versus running to max iters.
.endproc
; start: exported entry point. Currently a test harness that multiplies
; 5 by -3 with imul16 over and over; it never returns.
.proc start
factor_a = 5
factor_b = $fffd            ; -3 as a 16-bit two's-complement value
loop:
    ; FR0 = 5
    lda #<factor_a
    sta FR0
    lda #>factor_a
    sta FR0 + 1
    ; FR1 = -3
    lda #<factor_b
    sta FR1
    lda #>factor_b
    sta FR1 + 1
    jsr imul16
    ; FR2 should now hold -15 as a 32-bit value
    jmp loop
.endproc