Compare commits

..

4 commits

Author SHA1 Message Date
3553ce986f shave some cycles off 16-bit squaring with shift instead of add
also fix the comments about how many cycles shift takes
2024-12-31 15:29:40 -08:00
0f49760aa5 unify tables for squaring and multiplication 2024-12-31 02:26:24 -08:00
f06aed0c00 set results from both 8-bit squares first
Since the results from the lo and hi squares don't overlap or overflow,
they can be written directly to the final output location without doing
any addition. Then only the multiplication that goes in the middle needs
any adds.
2024-12-31 02:22:31 -08:00
aee587388d eliminate mul_hibyte512 table
This costs an extra half cycle on average, assuming uniform distribution
of multiplication inputs. I don't think a half cycle is worth an extra
256-byte table.
2024-12-31 02:01:45 -08:00
4 changed files with 140 additions and 248 deletions

341
mandel.s
View file

@ -1,42 +1,43 @@
; Our zero-page vars ; Our zero-page vars
ox = $80 ; fixed8.24: center point x sx = $80 ; i16: screen pixel x
oy = $84 ; fixed8.24: center point y sy = $82 ; i16: screen pixel y
cx = $88 ; fixed8.24: c_x ox = $84 ; fixed4.12: center point x
cy = $8c ; fixed8.24: c_y oy = $86 ; fixed4.12: center point y
cx = $88 ; fixed4.12: c_x
cy = $8a ; fixed4.12: c_y
zx = $8c ; fixed4.12: z_x
zy = $8e ; fixed4.12: z_y
zx = $90 ; fixed8.24: z_x zx_2 = $90 ; fixed4.12: z_x^2
zy = $94 ; fixed8.24: z_y zy_2 = $92 ; fixed4.12: z_y^2
zx_2 = $98 ; fixed8.24: z_x^2 zx_zy = $94 ; fixed4.12: z_x * z_y
zy_2 = $9c ; fixed8.24: z_y^2 dist = $96 ; fixed4.12: z_x^2 + z_y^2
zx_zy = $a0 ; fixed8.24: z_x * z_y iter = $a0 ; u8: iteration count
dist = $a4 ; fixed8.24: z_x^2 + z_y^2
sx = $a8 ; i16: screen pixel x
sy = $aa ; i16: screen pixel y
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $ad ; u8: index into z_buffer
z_buffer_end = $ae ; u8: index into z_buffer
iter = $af ; u8: iteration count
ptr = $b0 ; u16 zoom = $a1 ; u8: zoom shift level
pixel_ptr = $b2 ; u16 count_frames = $a2 ; u8
zoom = $b4 ; u8: zoom shift level count_pixels = $a3 ; u8
fill_level = $b5 ; u8 total_ms = $a4 ; float48
pixel_color = $b6 ; u8 total_pixels = $aa ; float48
pixel_mask = $b7 ; u8
pixel_shift = $b8 ; u8
pixel_offset = $b9 ; u8
palette_offset = $ba ; u8
chroma_offset = $bb ; u8
palette_ticks = $bc ; u8
chroma_ticks = $bd ; u8
count_frames = $be ; u8
count_pixels = $bf ; u8
total_pixels = $c0 ; float48 z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
total_ms = $c6 ; float48 z_buffer_start = $b1 ; u8: index into z_buffer
temp = $cc ; u16 z_buffer_end = $b2 ; u8: index into z_buffer
temp2 = $ce ; u16 temp = $b4 ; u16
temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8
pixel_mask = $bb ; u8
pixel_shift = $bc ; u8
pixel_offset = $bd ; u8
fill_level = $be ; u8
palette_offset = $bf ; u8
palette_ticks = $c0 ; u8
chroma_ticks = $c1 ; u8
chroma_offset = $c2 ; u8
ptr = $c4 ; u16
palette_delay = 23 palette_delay = 23
chroma_delay = 137 chroma_delay = 137
@ -128,11 +129,8 @@ KEY_0 = 50
mantissa .byte 5 mantissa .byte 5
.endstruct .endstruct
.import mul_lobyte256 .import mul_lobyte
.import mul_hibyte256 .import mul_hibyte
.import mul_hibyte512
.import sqr_lobyte
.import sqr_hibyte
.data .data
@ -292,16 +290,16 @@ viewport_zoom:
.byte 6 .byte 6
viewport_ox: viewport_ox:
.dword $00000000 .word $0000
.dword $ff110000 .word $f110
.dword $ff110000 .word $f110
.dword $fe400000 .word $e400
viewport_oy: viewport_oy:
.dword $00000000 .word $0000
.dword $ffb60000 .word $fb60
.dword $ffbe0000 .word $fbe0
.dword $00000000 .word $0000
; 2 + 9 * byte cycles ; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2 .macro add bytes, dest, arg1, arg2
@ -320,7 +318,7 @@ viewport_oy:
; 38 cycles ; 38 cycles
.macro add32 dest, arg1, arg2 .macro add32 dest, arg1, arg2
add 4, dest, arg1, arg2 add 4, dest, arg2, dest
.endmacro .endmacro
; 8 cycles ; 8 cycles
@ -350,7 +348,7 @@ viewport_oy:
sub 4, dest, arg1, arg2 sub 4, dest, arg1, arg2
.endmacro .endmacro
; 3 + 5 * bytes cycles ; 3 + 5 * (bytes - 1) cycles
.macro shl bytes, arg .macro shl bytes, arg
asl arg ; 3 cyc asl arg ; 3 cyc
.repeat bytes-1, i .repeat bytes-1, i
@ -358,17 +356,17 @@ viewport_oy:
.endrepeat .endrepeat
.endmacro .endmacro
; 13 cycles ; 8 cycles
.macro shl16 arg .macro shl16 arg
shl 2, arg shl 2, arg
.endmacro .endmacro
; 18 cycles ; 13 cycles
.macro shl24 arg .macro shl24 arg
shl 3, arg shl 3, arg
.endmacro .endmacro
; 23 cycles ; 18 cycles
.macro shl32 arg .macro shl32 arg
shl 4, arg shl 4, arg
.endmacro .endmacro
@ -425,45 +423,32 @@ viewport_oy:
round16 arg ; 11-27 cycles round16 arg ; 11-27 cycles
.endmacro .endmacro
; input: arg1, arg2 as fixed4.12 .macro imul16_round dest, arg1, arg2, shift
; output: dest as fixed8.24
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; ? cyc jsr imul16_func ; ? cyc
copy32 dest, FR2 ; 24 cyc shift_round_16 FR2, shift ; 103-119 cycles for shift=4
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; input: arg as fixed4.12 .macro sqr16_round dest, arg, shift
; output: dest as fixed8.24 ;imul16_round dest, arg, arg, shift
.macro sqr16 dest, arg
copy16 FR0, arg ; 12 cyc copy16 FR0, arg ; 12 cyc
jsr sqr16_func ; ? cyc jsr sqr16_func ; ? cyc
copy32 dest, FR2 ; 24 cyc shift_round_16 FR2, shift ; 103-119 cycles for shift=4
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; input: arg as u8
; output: dest as u16
; clobbers a, x ; clobbers a, x
.macro sqr8 dest, arg .macro sqr8 dest, arg
ldx arg ldx arg
lda sqr_lobyte,x txa
lsr
lda mul_lobyte,x
rol
sta dest sta dest
lda sqr_hibyte,x lda mul_hibyte,x
sta dest + 1 rol
.endmacro
; input: arg as u8
; input/output: dest as u16
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
clc
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1
sta dest + 1 sta dest + 1
.endmacro .endmacro
@ -552,22 +537,25 @@ bank_switch_table:
clc ; 2 cyc clc ; 2 cyc
adc mul_factor_x ; 3 cyc adc mul_factor_x ; 3 cyc
tax ; 2 cyc tax ; 2 cyc
bcc under256 ; 2 cyc lda mul_hibyte,x ; 4 cyc
lda mul_hibyte512,x ; 4 cyc bcc next ; 2 cyc
bcs next ; 2 cyc ; carry is set so we get to add 1 for free, but need to add 0x80
under256: adc #$7f ; 2 cyc
lda mul_hibyte256,x ; 4 cyc clc ; 2 cyc
sec ; 2 cyc ; stash the sum temporarily so we can use it as an operand to add
stx mul_product_lo ; 3 cyc
adc mul_product_lo ; 3 cyc
next: next:
sec ; 2 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
lda mul_lobyte256,x ; 4 cyc lda mul_lobyte,x ; 4 cyc
; - a^2/2 ; - a^2/2
ldx mul_factor_a ; 3 cyc ldx mul_factor_a ; 3 cyc
sbc mul_lobyte256,x ; 4 cyc sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
; + x & a & 1: ; + x & a & 1:
@ -586,10 +574,10 @@ bank_switch_table:
; - x^2/2 ; - x^2/2
small_product: small_product:
sec ; 2 cyc sec ; 2 cyc
sbc mul_lobyte256,x ; 4 cyc sbc mul_lobyte,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte256,x ; 4 cyc sbc mul_hibyte,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
.endscope .endscope
.endif .endif
@ -796,18 +784,14 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l ; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg sqr8 result, arg
lda #0 sqr8 result + 2, arg + 1
sta result + 2
sta result + 3
imul8 inter, arg + 1, arg, xe imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter shl16 inter
add_carry result + 3 add_carry result + 3
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc rts ; 6 cyc
.endscope .endscope
.endmacro .endmacro
@ -875,8 +859,8 @@ next:
.proc mandelbrot .proc mandelbrot
; input: ; input:
; cx: position scaled to 8.24 fixed point - -128..+127.9 ; cx: position scaled to 4.12 fixed point - -8..+7.9
; cy: position scaled to 8.24 ; cy: position scaled to 4.12
; ;
; output: ; output:
; iter: iteration count at escape or 0 ; iter: iteration count at escape or 0
@ -888,41 +872,12 @@ next:
; zx_zy = 0 ; zx_zy = 0
; dist = 0 ; dist = 0
; iter = 0 ; iter = 0
; lda #00
; ldx #(iter - zx + 1)
;initloop:
; sta zx - 1,x
; dex
; bne initloop
; sta z_buffer_start
; sta z_buffer_end
lda #00 lda #00
sta zx ldx #(iter - zx + 1)
sta zx + 1 initloop:
sta zx + 2 sta zx - 1,x
sta zx + 3 dex
sta zy bne initloop
sta zy + 1
sta zy + 2
sta zy + 3
sta zx_2
sta zx_2 + 1
sta zx_2 + 2
sta zx_2 + 3
sta zy_2
sta zy_2 + 1
sta zy_2 + 2
sta zy_2 + 3
sta zx_zy
sta zx_zy + 1
sta zx_zy + 2
sta zx_zy + 3
sta dist
sta dist + 1
sta dist + 2
sta dist + 3
sta iter
sta z_buffer_start sta z_buffer_start
sta z_buffer_end sta z_buffer_end
@ -934,8 +889,6 @@ loop:
keep_going: keep_going:
.macro quick_exit arg, max .macro quick_exit arg, max
; arg: fixed8.24
; max: integer
.local positive .local positive
.local negative .local negative
.local nope_out .local nope_out
@ -943,16 +896,16 @@ keep_going:
.local all_done .local all_done
; check sign bit ; check sign bit
lda arg + 3 lda arg + 1
bmi negative bmi negative
positive: positive:
cmp #max cmp #((max) << 4)
bmi all_done ; 'less than' bmi all_done ; 'less than'
jmp exit_path jmp exit_path
negative: negative:
cmp #(256 - max) cmp #(256 - ((max) << 4))
beq first_equal ; 'equal' on first byte beq first_equal ; 'equal' on first byte
bpl all_done ; 'greater than' bpl all_done ; 'greater than'
@ -960,44 +913,34 @@ keep_going:
jmp exit_path jmp exit_path
first_equal: first_equal:
; following bytes all 0 shows it's really 'equal'
lda arg + 2
bne all_done
lda arg + 1
bne all_done
lda arg lda arg
bne all_done beq nope_out ; 2nd byte 0 shows it's really 'equal'
jmp exit_path
all_done: all_done:
.endmacro .endmacro
; 8.24: (-128 .. 127.9) ; 4.12: (-8 .. +7.9)
; zx = zx_2 - zy_2 + cx ; zx = zx_2 - zy_2 + cx
sub32 zx, zx_2, zy_2 sub16 zx, zx_2, zy_2
add32 zx, zx, cx add16 zx, zx, cx
quick_exit zx, 2 quick_exit zx, 2
; zy = zx_zy + zx_zy + cy ; zy = zx_zy + zx_zy + cy
add32 zy, zx_zy, zx_zy add16 zy, zx_zy, zx_zy
add32 zy, zy, cy add16 zy, zy, cy
quick_exit zy, 2 quick_exit zy, 2
; convert 8.24 -> 4.12: (-8 .. +7.9)
shift_round_16 zx, 4
shift_round_16 zy, 4
; zx_2 = zx * zx ; zx_2 = zx * zx
sqr16 zx_2, zx + 2 sqr16_round zx_2, zx, 4
; zy_2 = zy * zy ; zy_2 = zy * zy
sqr16 zy_2, zy + 2 sqr16_round zy_2, zy, 4
; zx_zy = zx * zy ; zx_zy = zx * zy
imul16 zx_zy, zx + 2, zy + 2 imul16_round zx_zy, zx, zy, 4
; dist = zx_2 + zy_2 ; dist = zx_2 + zy_2
add32 dist, zx_2, zy_2 add16 dist, zx_2, zy_2
quick_exit dist, 4 quick_exit dist, 4
; if may be in the lake, look for looping output with a small buffer ; if may be in the lake, look for looping output with a small buffer
@ -1034,10 +977,10 @@ z_buffer_loop:
; Compare the previously stored z values ; Compare the previously stored z values
ldy #0 ldy #0
z_compare zx + 2 z_compare zx
z_compare zx + 3 z_compare zx + 1
z_compare zy + 2 z_compare zy
z_compare zy + 3 z_compare zy + 1
cpy #4 cpy #4
bne z_no_matches bne z_no_matches
@ -1052,10 +995,10 @@ z_no_matches:
z_nothing_to_read: z_nothing_to_read:
; Store and expand ; Store and expand
z_store zx + 2 z_store zx
z_store zx + 3 z_store zx + 1
z_store zy + 2 z_store zy
z_store zy + 3 z_store zy + 1
z_advance z_advance
stx z_buffer_end stx z_buffer_end
@ -1106,17 +1049,14 @@ cont:
enough: enough:
.endmacro .endmacro
.macro zoom_factor dest, src, aspect .macro zoom_factor dest, src, zoom, aspect
; output: dest: fixed8.24
; input: src: fixed4.12
; aspect: fixed4.12
; clobbers A, X, flags, etc ; clobbers A, X, flags, etc
copy16 dest, src copy16 dest, src
scale_zoom dest scale_zoom dest
; cy = cy * (3 / 4) ; cy = cy * (3 / 4)
; cx = cx * (5 / 4) ; cx = cx * (5 / 4)
imul16 dest, dest, aspect imul16_round dest, dest, aspect, 4
.endmacro .endmacro
.proc pset .proc pset
@ -1341,15 +1281,12 @@ skip_luma:
cpy #KEY_MINUS cpy #KEY_MINUS
beq minus beq minus
; temp+temp2 = $00010000 << (8 - zoom) ; temp = $0010 << (8 - zoom)
lda #$00 lda #$10
sta temp sta temp
sta temp + 1
lda #$01
sta temp + 2
lda #$00 lda #$00
sta temp + 3 sta temp + 1
scale_zoom temp + 2 scale_zoom temp
cpy #KEY_UP cpy #KEY_UP
beq up beq up
@ -1359,7 +1296,14 @@ skip_luma:
beq left beq left
cpy #KEY_RIGHT cpy #KEY_RIGHT
beq right beq right
jmp number_keys cpy #KEY_1
beq one
cpy #KEY_2
beq two
cpy #KEY_3
beq three
cpy #KEY_4
beq four
skip_char: skip_char:
lda #0 lda #0
@ -1378,29 +1322,17 @@ minus:
dec zoom dec zoom
jmp done jmp done
up: up:
sub32 oy, oy, temp sub16 oy, oy, temp
jmp done jmp done
down: down:
add32 oy, oy, temp add16 oy, oy, temp
jmp done jmp done
left: left:
sub32 ox, ox, temp sub16 ox, ox, temp
jmp done jmp done
right: right:
add32 ox, ox, temp add16 ox, ox, temp
jmp done jmp done
number_keys:
cpy #KEY_1
beq one
cpy #KEY_2
beq two
cpy #KEY_3
beq three
cpy #KEY_4
beq four
jmp skip_char
one: one:
ldx #0 ldx #0
jmp load_key_viewport jmp load_key_viewport
@ -1462,32 +1394,17 @@ zero_byte_loop:
txa txa
asl a asl a
asl a
tax tax
lda viewport_ox,x lda viewport_ox,x
sta ox sta ox
lda viewport_oy,x lda viewport_oy,x
sta oy sta oy
inx inx
lda viewport_ox,x lda viewport_ox,x
sta ox + 1 sta ox + 1
lda viewport_oy,x lda viewport_oy,x
sta oy + 1 sta oy + 1
inx
lda viewport_ox,x
sta ox + 2
lda viewport_oy,x
sta oy + 2
inx
lda viewport_ox,x
sta ox + 3
lda viewport_oy,x
sta oy + 3
rts rts
.endproc .endproc
@ -1609,10 +1526,10 @@ skipped_mask:
not_skipped_mask: not_skipped_mask:
; run the fractal! ; run the fractal!
zoom_factor cx, sx, aspect_x zoom_factor cx, sx, zoom, aspect_x
add32 cx, cx, ox add16 cx, cx, ox
zoom_factor cy, sy, aspect_y zoom_factor cy, sy, zoom, aspect_y
add32 cy, cy, oy add16 cy, cy, oy
jsr mandelbrot jsr mandelbrot
jsr pset jsr pset

View file

@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26. The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
Iterations are capped at 255. Iterations are capped at 255.

View file

@ -11,40 +11,19 @@ function db(func) {
return lines.join('\n'); return lines.join('\n');
} }
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log( console.log(
`.segment "TABLES" `.segment "TABLES"
.export mul_lobyte256 .export mul_lobyte
.export mul_hibyte256 .export mul_hibyte
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i + 1) / 2 for the multiplier ; (i * i) / 2 for the multiplier
.align 256 .align 256
mul_lobyte256: mul_lobyte:
${db((i) => squares[i] & 0xff)} ${db((i) => ((i * i) >> 1) & 0xff)}
.align 256 .align 256
mul_hibyte256: mul_hibyte:
${db((i) => (squares[i] >> 8) & 0xff)} ${db((i) => ((i * i) >> 9) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`); `);

View file

@ -1,13 +1,9 @@
things to try: things to try:
* skip add on the top-byte multiply in sqr8/mul8
* should save a few cycles, suggestion by jamey
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* try 3.13 fixed point instead of 4.12 for more precision * try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra bit? * can we get away without the extra bit?
* since exit compare space would be 6.26 i think so
* y-axis mirror optimization * y-axis mirror optimization