annotate cycle counts

This commit is contained in:
Brooke Vibber 2022-12-30 17:33:18 -08:00
parent 6583dc3680
commit 4b1001bfdc

View file

@ -57,19 +57,22 @@ plus:
neg 4, arg
.endmacro
; add bytes, arg1, arg2
;
; Multi-byte in-place addition: arg1 := arg1 + arg2.
; Byte 0 is added first and the carry chains upward, so both operands are
; treated as least-significant-byte-first.
;   bytes - operand width in bytes (assemble-time constant)
;   arg1  - address of the first operand; receives the sum
;   arg2  - address of the second operand; only read
; Clobbers A and the flags; on exit C is the carry out of the top byte.
; Cost: 2 + 9 * bytes cycles (the per-instruction counts below assume
; zero-page operands).
.macro add bytes, arg1, arg2
    clc                     ; 2 cyc - no carry into the lowest byte
    .repeat bytes, byte
        lda arg1+byte       ; 3 cyc
        adc arg2+byte       ; 3 cyc - picks up the carry from the previous byte
        sta arg1+byte       ; 3 cyc
    .endrepeat
.endmacro
; add16 arg1, arg2
; 16-bit in-place add: expands to `add` with a width of 2 bytes, so the
; 2-byte value at arg2 is added into the 2-byte value at arg1.
; 20 cycles (2 + 9 * 2, from the cost formula given for `add`).
.macro add16 arg1, arg2
add 2, arg1, arg2
.endmacro
; add32 arg1, arg2
; 32-bit in-place add: expands to `add` with a width of 4 bytes, so the
; 4-byte value at arg2 is added into the 4-byte value at arg1.
; 38 cycles (2 + 9 * 4, from the cost formula given for `add`).
.macro add32 arg1, arg2
add 4, arg1, arg2
.endmacro
@ -112,6 +115,7 @@ plus:
shr 4, arg
.endmacro
; bitmul16 arg1, arg2, result, bitnum
;
; One step of an unrolled 16x16 -> 32-bit shift-and-add multiply.
; Tests bit `bitnum` of the 16-bit value at arg1; when that bit is set, the
; 16-bit value at arg2 is added into the top two bytes of result. The
; result is then rotated right by one bit, with the carry out of that add
; (or 0 when the add was skipped) entering at the top.
;   arg1   - address of the 16-bit multiplier (only read)
;   arg2   - address of the 16-bit multiplicand (only read)
;   result - address of the 32-bit accumulator
;   bitnum - assemble-time bit index, 0..15
; Clobbers A and the flags.
; Cost, counting the branch as 2 cycles when it falls through and 3 when
; it is taken (no page crossing), per-instruction counts assuming
; zero-page operands:
;   bitnum < 8:  25 cycles (bit clear) or 44 cycles (bit set)
;   bitnum >= 8: 30 cycles (bit clear) or 49 cycles (bit set)
.macro bitmul16 arg1, arg2, result, bitnum
    .local next

    ; arg1 must be 0 or positive
    ; arg2 must be 0 or positive

    ; Carry must be clear on the skip path so that the rotate below
    ; shifts a 0 into the top of the result.
    clc                             ; 2 cyc

    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                    ; 3 cyc
        and #(1 << bitnum)          ; 2 cyc
    .else
        lda arg1 + 1                ; 3 cyc
        and #(1 << (bitnum - 8))    ; 2 cyc
    .endif
    beq next                        ; 2 cyc if not taken, 3 cyc if taken

    ; 16-bit add on the top bits
    ; NOTE(review): add16 begins with its own clc; carry is already clear
    ; here, so 2 of these 20 cycles are redundant.
    add16 result + 2, arg2          ; 20 cyc

next:
    ; Shift the 32-bit result down by one bit,
    ; saving the previous carry.
    ror result + 3                  ; 5 cyc
    ror result + 2                  ; 5 cyc
    ror result + 1                  ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
        ; when it's all uninitialized data
        ror result                  ; 5 cyc
    .endif
.endmacro
; check_sign arg
;
; Check the sign bit and flip the argument to positive,
; keeping a count of sign bits in the X register.
;   arg - address of a 16-bit value; the sign is read from the byte at
;         arg + 1. When that byte is negative, neg16 is applied to arg
;         and X is incremented; otherwise arg and X are left untouched.
; Clobbers A and the flags. The caller is expected to initialise X.
; Cost: 6 cycles when arg is already non-negative (branch taken),
;       25 cycles when it is negative (3 + 2 + 18 + 2).
.macro check_sign arg
    .local positive

    lda arg + 1         ; 3 cyc - loading the high byte sets N from the sign bit
    bpl positive        ; 2 cyc if not taken, 3 cyc if taken
    neg16 arg           ; 18 cyc
    inx                 ; 2 cyc - one more negative operand seen
positive:
.endmacro
; imul16
;
; Signed 16-bit x 16-bit -> 32-bit multiply.
;   in:  arg1 (FR0), arg2 (FR1) - 16-bit operands; each one is negated in
;        place by check_sign when it is negative, so both are clobbered
;   out: result (FR2) - 32-bit product
; Clobbers A, X and the flags.
;
; Method: strip the signs (counting negatives in X), run 16 unrolled
; shift-and-add steps on the magnitudes, then negate the 32-bit product
; when exactly one input was negative.
;
; NOTE(review): the earlier annotation here read "579 to 725 cycles".
; Summing the per-step figures below (taken branches counted as 3 cycles,
; at most 15 multiplier bits set once the sign is stripped) gives roughly
; 473 to 810 cycles instead - confirm by measurement.
.proc imul16
    arg1 = FR0      ; 16-bit arg (clobbered)
    arg2 = FR1      ; 16-bit arg (clobbered)
    result = FR2    ; 32-bit result

    ldx #0                  ; 2 cyc
    ; counts the number of sign bits in X
    check_sign arg1         ; 6 or 25 cyc
    check_sign arg2         ; 6 or 25 cyc

    ; zero out the 32-bit temp's top 16 bits
    lda #0                  ; 2 cyc
    sta result + 2          ; 3 cyc
    sta result + 3          ; 3 cyc
    ; the bottom two bytes will get cleared by the shifts

    ; unrolled loop for maximum speed, at the cost
    ; of a larger routine
    .repeat 16, bitnum
        ; first half (bits 0-7):   25 or 44 cycles per step
        ; second half (bits 8-15): 30 or 49 cycles per step
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; In case of mixed input signs, return a negative result.
    ; X is 0, 1 or 2 here; only X = 1 means the signs differed.
    cpx #1                  ; 2 cyc
    bne positive_result     ; 2 cyc if not taken, 3 cyc if taken
    neg32 result            ; 34 cyc
positive_result:
    rts                     ; 6 cyc
.endproc
.proc iter