diff --git a/mandel.s b/mandel.s index 386094f..ab7cea5 100644 --- a/mandel.s +++ b/mandel.s @@ -131,45 +131,48 @@ minus: shr 4, arg .endmacro -.macro checkbit arg, bits - .if bits < 8 - lda arg - and #(1 << bits) - .else - lda arg + 1 - and #(1 << (bits - 8)) - .endif -.endmacro - -.macro bitmul16 arg1, arg2, res, bits +.macro bitmul16 arg1, arg2, result, bitnum .local next - checkbit arg2, bits clc + + ; test bit number bitnum of arg1: Z is set when that bit is 0, so the add below is skipped + .if bitnum < 8 + lda arg1 + and #(1 << bitnum) + .else + lda arg1 + 1 + and #(1 << (bitnum - 8)) + .endif beq next ; 16-bit add on the top bits - lda res + 2 - adc arg1 - sta res + 2 - lda res + 3 - adc arg1 + 1 + lda result + 2 + adc arg2 + sta result + 2 + lda result + 3 + adc arg2 + 1 + sta result + 3 next: - ; shift result right one bit - ; (shifts in the carry bit) - ror a - ror res - sta res + 1 + ; Shift the 32-bit result down by one bit, + ; rotating in the carry from the add above (still clear from clc if the add was skipped). + ror result + 3 + ror result + 2 + ror result + 1 + .if bitnum >= 8 + ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte + ; while it holds no product bits yet (the first one arrives at bit 8) + ror result + .endif .endmacro .proc imul16 ; 16-bit arg in FR0 ; 16-bit arg in FR1 ; 32-bit result in FR2 - ; clobbers FR1 and FR2 - ; zero out the 32-bit temp + ; zero out the 32-bit temp's top 16 bits lda #0 sta FR2 + 2 sta FR2 + 3 @@ -178,6 +181,8 @@ next: .repeat 16, bitnum bitmul16 FR0, FR1, FR2, bitnum .endrepeat + + rts .endproc .proc iter