annotate cycle counts
This commit is contained in:
parent
6583dc3680
commit
4b1001bfdc
1 changed files with 38 additions and 34 deletions
72
mandel.s
72
mandel.s
|
@ -57,19 +57,22 @@ plus:
|
||||||
neg 4, arg
|
neg 4, arg
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; 2 + 9 * bytes cycles
|
||||||
.macro add bytes, arg1, arg2
|
.macro add bytes, arg1, arg2
|
||||||
clc
|
clc ; 2 cyc
|
||||||
.repeat bytes, byte
|
.repeat bytes, byte
|
||||||
lda arg1+byte
|
lda arg1+byte ; 3 cyc
|
||||||
adc arg2+byte
|
adc arg2+byte ; 3 cyc
|
||||||
sta arg1+byte
|
sta arg1+byte ; 3 cyc
|
||||||
.endrepeat
|
.endrepeat
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; 20 cycles
|
||||||
.macro add16 arg1, arg2
|
.macro add16 arg1, arg2
|
||||||
add 2, arg1, arg2
|
add 2, arg1, arg2
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; 38 cycles
|
||||||
.macro add32 arg1, arg2
|
.macro add32 arg1, arg2
|
||||||
add 4, arg1, arg2
|
add 4, arg1, arg2
|
||||||
.endmacro
|
.endmacro
|
||||||
|
@ -112,6 +115,7 @@ plus:
|
||||||
shr 4, arg
|
shr 4, arg
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; 24 to 49 cycles
|
||||||
.macro bitmul16 arg1, arg2, result, bitnum
|
.macro bitmul16 arg1, arg2, result, bitnum
|
||||||
.local next
|
.local next
|
||||||
|
|
||||||
|
@ -119,79 +123,79 @@ plus:
|
||||||
; arg1 must be 0 or positive
|
; arg1 must be 0 or positive
|
||||||
; arg2 must be 0 or positive
|
; arg2 must be 0 or positive
|
||||||
|
|
||||||
clc
|
clc ; 2 cyc
|
||||||
|
|
||||||
; check if arg1 has 0 or 1 bit in this place
|
; check if arg1 has 0 or 1 bit in this place
|
||||||
|
; 5 cycles either way
|
||||||
.if bitnum < 8
|
.if bitnum < 8
|
||||||
lda arg1
|
lda arg1 ; 3 cyc
|
||||||
and #(1 << bitnum)
|
and #(1 << bitnum) ; 2 cyc
|
||||||
.else
|
.else
|
||||||
lda arg1 + 1
|
lda arg1 + 1 ; 3 cyc
|
||||||
and #(1 << (bitnum - 8))
|
and #(1 << (bitnum - 8)) ; 2 cyc
|
||||||
.endif
|
.endif
|
||||||
beq next
|
beq next ; 2 cyc (3 if taken)
|
||||||
|
|
||||||
; 16-bit add on the top bits
|
; 16-bit add on the top bits
|
||||||
lda result + 2
|
add16 result + 2, arg2 ; 20 cyc
|
||||||
adc arg2
|
|
||||||
sta result + 2
|
|
||||||
lda result + 3
|
|
||||||
adc arg2 + 1
|
|
||||||
sta result + 3
|
|
||||||
|
|
||||||
next:
|
next:
|
||||||
; Shift the 32-bit result down by one bit,
|
; Shift the 32-bit result down by one bit,
|
||||||
; saving the previous carry.
|
; saving the previous carry.
|
||||||
ror result + 3
|
ror result + 3 ; 5 cyc
|
||||||
ror result + 2
|
ror result + 2 ; 5 cyc
|
||||||
ror result + 1
|
ror result + 1 ; 5 cyc
|
||||||
.if bitnum >= 8
|
.if bitnum >= 8
|
||||||
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
|
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
|
||||||
; when it's all uninitialized data
|
; when it's all uninitialized data
|
||||||
ror result
|
ror result ; 5 cyc
|
||||||
.endif
|
.endif
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; 5 to 25 cycles
|
||||||
.macro check_sign arg
|
.macro check_sign arg
|
||||||
; Check sign bit and flip argument to positive,
|
; Check sign bit and flip argument to positive,
|
||||||
; keeping a count of sign bits in the X register.
|
; keeping a count of sign bits in the X register.
|
||||||
.local positive
|
.local positive
|
||||||
lda arg + 1
|
lda arg + 1 ; 3 cyc
|
||||||
bpl positive
|
bpl positive ; 2 cyc (3 if taken)
|
||||||
neg16 arg
|
neg16 arg ; 18 cyc
|
||||||
inx
|
inx ; 2 cyc
|
||||||
positive:
|
positive:
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
|
; 579 to 725 cycles
|
||||||
.proc imul16
|
.proc imul16
|
||||||
arg1 = FR0 ; 16-bit arg (clobbered)
|
arg1 = FR0 ; 16-bit arg (clobbered)
|
||||||
arg2 = FR1 ; 16-bit arg (clobbered)
|
arg2 = FR1 ; 16-bit arg (clobbered)
|
||||||
result = FR2 ; 32-bit result
|
result = FR2 ; 32-bit result
|
||||||
|
|
||||||
ldx #0
|
ldx #0 ; 2 cyc
|
||||||
; counts the number of sign bits in X
|
; counts the number of sign bits in X
|
||||||
check_sign arg1
|
check_sign arg1 ; 5 to 25 cyc
|
||||||
check_sign arg2
|
check_sign arg2 ; 5 to 25 cyc
|
||||||
|
|
||||||
; zero out the 32-bit temp's top 16 bits
|
; zero out the 32-bit temp's top 16 bits
|
||||||
lda #0
|
lda #0 ; 2 cyc
|
||||||
sta result + 2
|
sta result + 2 ; 3 cyc
|
||||||
sta result + 3
|
sta result + 3 ; 3 cyc
|
||||||
; the bottom two bytes will get cleared by the shifts
|
; the bottom two bytes will get cleared by the shifts
|
||||||
|
|
||||||
; unrolled loop for maximum speed, at the cost
|
; unrolled loop for maximum speed, at the cost
|
||||||
; of a larger routine
|
; of a larger routine
|
||||||
.repeat 16, bitnum
|
.repeat 16, bitnum
|
||||||
|
; first half: 24 to 44 cycles
|
||||||
|
; second half: 29 to 49 cycles
|
||||||
bitmul16 arg1, arg2, result, bitnum
|
bitmul16 arg1, arg2, result, bitnum
|
||||||
.endrepeat
|
.endrepeat
|
||||||
|
|
||||||
; In case of mixed input signs, return a negative result.
|
; In case of mixed input signs, return a negative result.
|
||||||
cpx #1
|
cpx #1 ; 2 cyc
|
||||||
bne positive_result
|
bne positive_result ; 2 cyc (3 if taken)
|
||||||
neg32 result
|
neg32 result ; 34 cyc
|
||||||
positive_result:
|
positive_result:
|
||||||
|
|
||||||
rts
|
rts ; 6 cyc
|
||||||
.endproc
|
.endproc
|
||||||
|
|
||||||
.proc iter
|
.proc iter
|
||||||
|
|
Loading…
Reference in a new issue