; FP registers in zero page
FR0 = $d4
FRE = $da
FR1 = $e0
FR2 = $e6
FRX = $ec

.code

.export start

; Sign-extend the 16-bit little-endian value at arg into the 32-bit value
; at arg .. arg+3 (bytes arg+2 and arg+3 are overwritten). Clobbers A.
.macro sext16to32 arg
    .local done
    lda arg + 1
    asl a                   ; sign bit -> carry
    lda #$00                ; assume non-negative; lda leaves carry untouched
    bcc done
    lda #$ff                ; negative: extend with one bits
done:
    sta arg + 2
    sta arg + 3
.endmacro

; Copy `bytes` bytes from arg1 to arg2. Clobbers A.
.macro copy bytes, arg1, arg2
    .repeat bytes, byte
        lda arg1 + byte
        sta arg2 + byte
    .endrepeat
.endmacro

.macro copy16 arg1, arg2
    copy 2, arg1, arg2
.endmacro

.macro copy32 arg1, arg2
    copy 4, arg1, arg2
.endmacro

; Two's-complement negate a `bytes`-byte little-endian value in place
; (computes 0 - arg). Clobbers A.
; 2 + 8 * bytes cycles with a zero-page operand
.macro neg bytes, arg
    sec                     ; 2 cyc - no borrow going in
    .repeat bytes, byte     ; 8 * bytes cycles
        lda #00             ; 2 cyc
        sbc arg + byte      ; 3 cyc
        sta arg + byte      ; 3 cyc
    .endrepeat
.endmacro

; 18 cycles
.macro neg16 arg
    neg 2, arg
.endmacro

; 34 cycles
.macro neg32 arg
    neg 4, arg
.endmacro

; arg1 += arg2 on `bytes`-byte little-endian values. Clobbers A.
; 2 + 9 * bytes cycles with zero-page operands
.macro add bytes, arg1, arg2
    clc                     ; 2 cyc
    .repeat bytes, byte
        lda arg1 + byte     ; 3 cyc
        adc arg2 + byte     ; 3 cyc
        sta arg1 + byte     ; 3 cyc
    .endrepeat
.endmacro

; 20 cycles
.macro add16 arg1, arg2
    add 2, arg1, arg2
.endmacro

; 38 cycles
.macro add32 arg1, arg2
    add 4, arg1, arg2
.endmacro

; Shift a `bytes`-byte little-endian value left by one bit, in place.
; Starts at the low byte so the carry ripples upward.
.macro shl bytes, arg
    asl arg
    .repeat bytes - 1, byte
        rol arg + byte + 1
    .endrepeat
.endmacro

.macro shl16 arg
    shl 2, arg
.endmacro

.macro shl24 arg
    shl 3, arg
.endmacro

.macro shl32 arg
    shl 4, arg
.endmacro

; Logical shift of a `bytes`-byte little-endian value right by one bit,
; in place. Starts at the HIGH byte so the carry ripples downward.
.macro shr bytes, arg
    lsr arg + bytes - 1
    .repeat bytes - 1, byte
        ror arg + bytes - 2 - byte
    .endrepeat
.endmacro

.macro shr16 arg
    shr 2, arg
.endmacro

.macro shr24 arg
    shr 3, arg
.endmacro

.macro shr32 arg
    shr 4, arg
.endmacro

; One step of a 16x16 -> 32 shift-and-add multiply.
;
; If bit `bitnum` of arg1 is set, arg2 is added to the top 16 bits of
; result; then the carry plus the 32-bit result are shifted right by one.
; Running this for bitnum = 0..15 leaves arg1 * arg2 in result.
;
; arg1 must be 0 or positive
; arg2 must be 0 or positive
; Clobbers A and the flags.
;
; With zero-page operands:
;   bitnum < 8:  25 cycles (bit clear) or 42 cycles (bit set)
;   bitnum >= 8: 30 cycles (bit clear) or 47 cycles (bit set)
.macro bitmul16 arg1, arg2, result, bitnum
    .local shift

    ; Carry must be clear both for the add and, when the add is skipped,
    ; for the zero that gets rotated into the top of the result.
    clc                         ; 2 cyc

    ; check if arg1 has 0 or 1 bit in this place
    ; (lda/and leave the carry alone)
    .if bitnum < 8
        lda arg1                ; 3 cyc
        and #(1 << bitnum)      ; 2 cyc
    .else
        lda arg1 + 1            ; 3 cyc
        and #(1 << (bitnum - 8)) ; 2 cyc
    .endif
    beq shift                   ; 2 cyc (3 if taken)

    ; 16-bit add on the top bits; the carry out is the 17th bit
    lda result + 2              ; 3 cyc
    adc arg2                    ; 3 cyc
    sta result + 2              ; 3 cyc
    lda result + 3              ; 3 cyc
    adc arg2 + 1                ; 3 cyc
    sta result + 3              ; 3 cyc

shift:
    ; Shift the 32-bit result down by one bit, rotating in the carry
    ; from the add (or 0 if there was no add). Every byte is shifted
    ; exactly once on both paths.
    ror result + 3              ; 5 cyc
    ror result + 2              ; 5 cyc
    ror result + 1              ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ror result              ; 5 cyc
    .endif
.endmacro

; Check sign bit and flip argument to positive,
; keeping a count of sign bits in the X register.
; Clobbers A; increments X when arg was negative.
; 6 cycles if already positive, 25 cycles if negated
.macro check_sign arg
    .local positive
    lda arg + 1                 ; 3 cyc
    bpl positive                ; 2 cyc (3 if taken)
    neg16 arg                   ; 18 cyc
    inx                         ; 2 cyc
positive:
.endmacro

; Signed 16 x 16 -> 32 bit multiply.
;
; In:   FR0, FR1 = signed 16-bit little-endian operands
; Out:  FR2 .. FR2+3 = signed 32-bit little-endian product
; Clobbers A, X, flags; FR0/FR1 are replaced by their absolute values.
;
; Roughly 470 to 800 cycles depending on operand signs and the number of
; set bits in arg1 (estimate; assumes no page-crossing branches).
.proc imul16
    arg1 = FR0                  ; 16-bit arg (clobbered)
    arg2 = FR1                  ; 16-bit arg (clobbered)
    result = FR2                ; 32-bit result

    ldx #0                      ; 2 cyc
    ; counts the number of sign bits in X
    check_sign arg1
    check_sign arg2

    ; zero out the 32-bit temp's top 16 bits
    lda #0                      ; 2 cyc
    sta result + 2              ; 3 cyc
    sta result + 3              ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts

    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    .repeat 16, bitnum
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; In case of mixed input signs, return a negative result.
    ; (X == 0 or X == 2 means the product is already correctly positive.)
    cpx #1                      ; 2 cyc
    bne positive_result         ; 2 cyc (3 if taken)
    neg32 result                ; 34 cyc
positive_result:

    rts                         ; 6 cyc
.endproc

; Placeholder for the fixed-point Mandelbrot iteration; it contains no code
; yet (not even an rts), so it must not be called.
.proc iter
    ; (cx and cy should be pre-scaled to 6.26 fixed point)
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

    ; still working on the fixed-point loop:
    ;   iters++

    ;   6.26:
    ;   zx = zx_2 - zy_2 + cx
    ;   zy = zx_zy + zx_zy + cy
    ;   round to 6.10.

    ;   12.20:
    ;   zx_2 = zx * zx
    ;   zy_2 = zy * zy
    ;   dist = zx_2 + zy_2
    ;   if dist >= 4 break, else continue iterating
    ;   round zx_2, zy_2, dist to 6.26

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters
.endproc

; Entry point: repeatedly multiplies 5 by -3 as a smoke test of imul16.
.proc start
loop:
    ; FR0 = 5
    ; FR1 = -3
    lda #5
    sta FR0
    lda #0
    sta FR0 + 1
    lda #$fd
    sta FR1
    lda #$ff
    sta FR1 + 1
    jsr imul16
    ; should have 32-bit -15 ($fffffff1) in FR2
    jmp loop
.endproc