; FP registers in zero page
; NOTE(review): these look like the Atari OS floating-point package work
; registers -- confirm. In this file FR0, FR1 and FR2 are used as scratch
; space for the integer multiply; FRE and FRX are not referenced in the
; code visible here.
FR0 = $d4
FRE = $da
FR1 = $e0
FR2 = $e6
FRX = $ec

.code

.export start

; neg bytes, arg
; In-place two's-complement negation (arg = 0 - arg) of a little-endian
; integer that is `bytes` bytes long and starts at address `arg`.
; The subtract runs from the low byte upward so the borrow propagates.
; Overwrites A and the carry flag.
; 2 + 8 * byte cycles
.macro neg bytes, arg
    sec                     ; 2 cyc, no borrow going into the low byte
    .repeat bytes, byte     ; 8 * byte cycles
        lda #00             ; 2 cyc
        sbc arg + byte      ; 3 cyc
        sta arg + byte      ; 3 cyc
    .endrepeat
.endmacro

; neg16 arg
; Negate the 16-bit value at arg in place.
; 18 cycles
.macro neg16 arg
    neg 2, arg
.endmacro

; neg32 arg
; Negate the 32-bit value at arg in place.
; 34 cycles
.macro neg32 arg
    neg 4, arg
.endmacro

; bitmul16 arg1, arg2, result, bitnum
; inner loop for imul16: one unrolled shift-and-add step.
; If bit `bitnum` of arg1 is set, arg2 is added to the top 16 bits of
; result (result + 2, result + 3); the running product is then shifted
; right by one bit, with the carry out of the add becoming the new top bit.
; bitnum must be an assembly-time constant in 0..15.
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
; NOTE(review): the cycle figures in this file disagree with each other
; (compare the zero:/one: labels below and the notes inside imul16).
; Counting from standard 6502 timings, with the taken bne costing 3 cycles
; and no page crossing, I get 25 / 40 for bitnum < 8 and 30 / 45 for
; bitnum >= 8 -- TODO confirm by measurement.
.macro bitmul16 arg1, arg2, result, bitnum
    .local zero
    .local one
    .local next

    ; does 16-bit adds
    ; arg1 must be 0 or positive
    ; arg2 must be 0 or positive

    ; 7 cycles up to the branch

    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1            ; 3 cyc
        and #(1 << bitnum)  ; 2 cyc
    .else
        lda arg1 + 1        ; 3 cyc
        and #(1 << (bitnum - 8)) ; 2 cyc
    .endif
    bne one ; 2 cyc (one more when the branch is taken)

zero:   ; 18 cyc, 23 cyc
    ; nothing to add: shift a 0 into the top bit; bit 0 of result + 3
    ; goes to carry and feeds the ror chain at next:
    lsr result + 3   ; 5 cyc
    jmp next         ; 3 cyc

one:    ; 32 cyc, 37 cyc
    ; 16-bit add on the top bits
    clc              ; 2 cyc
    lda result + 2   ; 3 cyc
    adc arg2         ; 3 cyc
    sta result + 2   ; 3 cyc
    lda result + 3   ; 3 cyc
    adc arg2 + 1     ; 3 cyc
    ; the carry out of the add is bit 16 of the sum; rotating it into
    ; bit 7 of the high byte keeps it while shifting right by one
    ror a            ; 2 cyc
    sta result + 3   ; 3 cyc
next:
    ; carry holds the bit shifted out of result + 3; pass it down the chain
    ror result + 2   ; 5 cyc
    ror result + 1   ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ; (during the first 8 steps no product bit has reached result + 1's
        ; bit 0 yet, so there is nothing valid to shift into the low byte)
        ror result       ; 5 cyc
    .endif

.endmacro

; check_sign arg
; If the 16-bit little-endian value at arg is negative (bit 7 of arg + 1
; set), negate it in place and increment X.
; X must be initialised by the caller. Overwrites A.
; 5 to 25 cycles
; NOTE(review): the non-negative path is lda (3) + taken bpl (3) = 6 by
; standard 6502 timings -- TODO confirm.
.macro check_sign arg
    ; Check sign bit and flip argument to positive,
    ; keeping a count of sign bits in the X register.
    .local positive
    lda arg + 1     ; 3 cyc, high byte carries the sign
    bpl positive    ; 2 cyc
    neg16 arg       ; 18 cyc
    inx             ; 2 cyc
positive:
.endmacro

; imul16
; Signed 16-bit x 16-bit -> 32-bit multiply.
;   in:  FR0, FR0 + 1   first operand, two's complement, low byte first
;        FR1, FR1 + 1   second operand, same format
;   out: FR2 .. FR2 + 3 32-bit product, two's complement, low byte first
; Both operands are replaced by their absolute values as a side effect.
; Overwrites A and X.
; min 454 cycles
; max 756 cycles
; NOTE(review): these totals were presumably derived from the per-step
; figures, which are inconsistent across this file -- re-measure.
.proc imul16
    arg1 = FR0 ; 16-bit arg (clobbered)
    arg2 = FR1 ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

    ldx #0 ; 2 cyc
    ; counts the number of sign bits in X
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc

    ; zero out the 32-bit temp's top 16 bits
    lda #0 ; 2 cyc
    sta result + 2 ; 3 cyc
    sta result + 3 ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts
    ; (every bit of result and result + 1 is replaced by a product bit
    ; rotated down from above by the time all 16 steps have run)

    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    ; 424 to 672 cycles
    .repeat 16, bitnum
        ; first half: 22 to 40 cycles
        ; second half: 29 to 47 cycles
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; In case of mixed input signs, return a negative result.
    ; X is 0 (both non-negative), 1 (exactly one negative) or 2 (both
    ; negative); only X = 1 needs the product negated.
    cpx #1 ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result ; 34 cyc
positive_result:

    rts ; 6 cyc
.endproc

; iter
; Placeholder: contains no instructions yet, only pseudocode notes for the
; planned fixed-point iteration loop.
.proc iter
    ; (cx and cy should be pre-scaled to 6.26 fixed point)

    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

    ; still working on the fixed-point

    ; loop:
    ; iters++

    ; 6.26:
    ; zx = zx_2 - zy_2 + cx
    ; NOTE(review): the line above originally read "zx_2 + zy_2 + cx"; the
    ; real part of (zx + i*zy)^2 + c is zx^2 - zy^2 + cx -- confirm intent.
    ; zy = zx_zy + zx_zy + cy
    ; round to 6.10.

    ; 12.20:
    ; zx_2 = zx * zx
    ; zy_2 = zy * zy
    ; dist = zx_2 + zy_2
    ; if dist >= 4 break, else continue iterating

    ; round zx_2, zy_2, dist to 6.26

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters

.endproc

; start
; Exported entry point. Currently a smoke test for imul16: loads 5 and -3,
; multiplies them, then spins forever so FR2 can be inspected.
.proc start

; NOTE(review): looplong is not referenced anywhere in the code visible here.
looplong:
    ; FR0 = 5
    ; FR1 = -3
    lda #5
    sta FR0
    lda #0
    sta FR0 + 1
    lda #$fd        ; low byte of $fffd = -3
    sta FR1
    lda #$ff        ; high byte of $fffd
    sta FR1 + 1
    jsr imul16
    ; should have 32-bit -15 in FR2
    ; (i.e. $fffffff1, stored low byte first: $f1 $ff $ff $ff)

loop:
    ; halt here
    jmp loop
.endproc