From 4b1001bfdc5b9abe08b6ce0c277e150a37f5a8ab Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 30 Dec 2022 17:33:18 -0800 Subject: [PATCH] annotate cycle counts --- mandel.s | 72 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/mandel.s b/mandel.s index 465ed96..39571b3 100644 --- a/mandel.s +++ b/mandel.s @@ -57,19 +57,22 @@ plus: neg 4, arg .endmacro +; 2 + 9 * bytes cycles .macro add bytes, arg1, arg2 - clc + clc ; 2 cyc .repeat bytes, byte - lda arg1+byte - adc arg2+byte - sta arg1+byte + lda arg1+byte ; 3 cyc + adc arg2+byte ; 3 cyc + sta arg1+byte ; 3 cyc .endrepeat .endmacro +; 20 cycles .macro add16 arg1, arg2 add 2, arg1, arg2 .endmacro +; 38 cycles .macro add32 arg1, arg2 add 4, arg1, arg2 .endmacro @@ -112,6 +115,7 @@ plus: shr 4, arg .endmacro +; 25 to 49 cycles .macro bitmul16 arg1, arg2, result, bitnum .local next @@ -119,79 +123,79 @@ plus: ; arg1 must be 0 or positive ; arg2 must be 0 or positive - clc + clc ; 2 cyc ; check if arg1 has 0 or 1 bit in this place + ; 5 cycles either way .if bitnum < 8 - lda arg1 - and #(1 << bitnum) + lda arg1 ; 3 cyc + and #(1 << bitnum) ; 2 cyc .else - lda arg1 + 1 - and #(1 << (bitnum - 8)) + lda arg1 + 1 ; 3 cyc + and #(1 << (bitnum - 8)) ; 2 cyc .endif - beq next + beq next ; 2 cyc (3 if taken) ; 16-bit add on the top bits - lda result + 2 - adc arg2 - sta result + 2 - lda result + 3 - adc arg2 + 1 - sta result + 3 + add16 result + 2, arg2 ; 20 cyc next: ; Shift the 32-bit result down by one bit, ; saving the previous carry. 
- ror result + 3 - ror result + 2 - ror result + 1 + ror result + 3 ; 5 cyc + ror result + 2 ; 5 cyc + ror result + 1 ; 5 cyc .if bitnum >= 8 ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte ; when it's all uninitialized data - ror result + ror result ; 5 cyc .endif .endmacro +; 6 to 25 cycles .macro check_sign arg ; Check sign bit and flip argument to postive, ; keeping a count of sign bits in the X register. .local positive - lda arg + 1 - bpl positive - neg16 arg - inx + lda arg + 1 ; 3 cyc + bpl positive ; 2 cyc (3 if taken) + neg16 arg ; 18 cyc + inx ; 2 cyc positive: .endmacro +; 579 to 725 cycles (TODO confirm: not re-derived from the per-line counts) .proc imul16 arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result - ldx #0 + ldx #0 ; 2 cyc ; counts the number of sign bits in X - check_sign arg1 - check_sign arg2 + check_sign arg1 ; 6 to 25 cyc + check_sign arg2 ; 6 to 25 cyc ; zero out the 32-bit temp's top 16 bits - lda #0 - sta result + 2 - sta result + 3 + lda #0 ; 2 cyc + sta result + 2 ; 3 cyc + sta result + 3 ; 3 cyc ; the bottom two bytes will get cleared by the shifts ; unrolled loop for maximum speed, at the cost ; of a larger routine .repeat 16, bitnum + ; first half: 25 to 44 cycles + ; second half: 30 to 49 cycles bitmul16 arg1, arg2, result, bitnum .endrepeat ; In case of mixed input signs, return a negative result. - cpx #1 - bne positive_result - neg32 result + cpx #1 ; 2 cyc + bne positive_result ; 2 cyc (3 if taken) + neg32 result ; 34 cyc positive_result: - rts + rts ; 6 cyc .endproc .proc iter