Compare commits
No commits in common. "38091e535f610c205a5d7eb8955b8a2a4103ff52" and "9682b4a6b38620ff1ec939087426571d368d0419" have entirely different histories.
38091e535f
...
9682b4a6b3
1 changed file with 12 additions and 27 deletions
39
mandel.s
39
mandel.s
|
@ -30,18 +30,15 @@ FRX = $ec
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; inner loop for imul16
|
; inner loop for imul16
|
||||||
; bitnum < 8: 25 or 41 cycles
|
; 24 to 44 cycles
|
||||||
; bitnum >= 8: 30 or 46 cycles
|
|
||||||
.macro bitmul16 arg1, arg2, result, bitnum
|
.macro bitmul16 arg1, arg2, result, bitnum
|
||||||
.local zero
|
|
||||||
.local one
|
|
||||||
.local next
|
.local next
|
||||||
|
|
||||||
; does 16-bit adds
|
; does 16-bit adds
|
||||||
; arg1 must be 0 or positive
|
; arg1 must be 0 or positive
|
||||||
; arg2 must be 0 or positive
|
; arg2 must be 0 or positive
|
||||||
|
|
||||||
; 7 cycles up to the branch
|
clc ; 2 cyc
|
||||||
|
|
||||||
; check if arg1 has 0 or 1 bit in this place
|
; check if arg1 has 0 or 1 bit in this place
|
||||||
; 5 cycles either way
|
; 5 cycles either way
|
||||||
|
@ -52,29 +49,21 @@ FRX = $ec
|
||||||
lda arg1 + 1 ; 3 cyc
|
lda arg1 + 1 ; 3 cyc
|
||||||
and #(1 << (bitnum - 8)) ; 2 cyc
|
and #(1 << (bitnum - 8)) ; 2 cyc
|
||||||
.endif
|
.endif
|
||||||
bne one ; 2 cyc
|
beq next ; 2 cyc
|
||||||
|
|
||||||
zero: ; 18 cyc, 23 cyc
|
|
||||||
lsr result + 3 ; 5 cyc
|
|
||||||
ror result + 2 ; 5 cyc
|
|
||||||
ror result + 1 ; 5 cyc
|
|
||||||
.if bitnum >= 8
|
|
||||||
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
|
|
||||||
; when it's all uninitialized data
|
|
||||||
ror result ; 5 cyc
|
|
||||||
.endif
|
|
||||||
jmp next ; 3 cyc
|
|
||||||
|
|
||||||
one: ; 32 cyc, 37 cyc
|
|
||||||
; 16-bit add on the top bits
|
; 16-bit add on the top bits
|
||||||
clc ; 2 cyc
|
|
||||||
lda result + 2 ; 3 cyc
|
lda result + 2 ; 3 cyc
|
||||||
adc arg2 ; 3 cyc
|
adc arg2 ; 3 cyc
|
||||||
sta result + 2 ; 3 cyc
|
sta result + 2 ; 3 cyc
|
||||||
lda result + 3 ; 3 cyc
|
lda result + 3 ; 3 cyc
|
||||||
adc arg2 + 1 ; 3 cyc
|
adc arg2 + 1 ; 3 cyc
|
||||||
ror a ; 2 cyc
|
ror a ; 2 cyc - get a jump on the shift
|
||||||
sta result + 3 ; 3 cyc
|
sta result + 3 ; 3 cyc
|
||||||
|
|
||||||
|
; Shift the 32-bit result down by one bit,
|
||||||
|
; saving the previous carry.
|
||||||
|
ror result + 3 ; 5 cyc
|
||||||
|
next:
|
||||||
ror result + 2 ; 5 cyc
|
ror result + 2 ; 5 cyc
|
||||||
ror result + 1 ; 5 cyc
|
ror result + 1 ; 5 cyc
|
||||||
.if bitnum >= 8
|
.if bitnum >= 8
|
||||||
|
@ -82,9 +71,6 @@ one: ; 32 cyc, 37 cyc
|
||||||
; when it's all uninitialized data
|
; when it's all uninitialized data
|
||||||
ror result ; 5 cyc
|
ror result ; 5 cyc
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
next:
|
|
||||||
|
|
||||||
.endmacro
|
.endmacro
|
||||||
|
|
||||||
; 5 to 25 cycles
|
; 5 to 25 cycles
|
||||||
|
@ -121,8 +107,8 @@ positive:
|
||||||
; of a larger routine
|
; of a larger routine
|
||||||
; 424 to 672 cycles
|
; 424 to 672 cycles
|
||||||
.repeat 16, bitnum
|
.repeat 16, bitnum
|
||||||
; first half: 22 to 40 cycles
|
; first half: 24 to 40 cycles
|
||||||
; second half: 29 to 47 cycles
|
; second half: 29 to 44 cycles
|
||||||
bitmul16 arg1, arg2, result, bitnum
|
bitmul16 arg1, arg2, result, bitnum
|
||||||
.endrepeat
|
.endrepeat
|
||||||
|
|
||||||
|
@ -167,7 +153,7 @@ loop:
|
||||||
|
|
||||||
.proc start
|
.proc start
|
||||||
|
|
||||||
looplong:
|
loop:
|
||||||
; FR0 = 5
|
; FR0 = 5
|
||||||
; FR1 = -3
|
; FR1 = -3
|
||||||
lda #5
|
lda #5
|
||||||
|
@ -182,6 +168,5 @@ looplong:
|
||||||
jsr imul16
|
jsr imul16
|
||||||
; should have 32-bit -15 in FR2
|
; should have 32-bit -15 in FR2
|
||||||
|
|
||||||
loop:
|
|
||||||
jmp loop
|
jmp loop
|
||||||
.endproc
|
.endproc
|
||||||
|
|
Loading…
Reference in a new issue