Compare commits

..

13 commits

Author SHA1 Message Date
d8601bb856 fix fix 2024-12-31 15:03:43 -08:00
7985ea9a39 fix panning for 32-bi 2024-12-31 14:45:38 -08:00
cc83c76706 update docs for 32-bit intermediates 2024-12-31 14:16:43 -08:00
2e8893fd78 haha fuck me 2024-12-31 13:54:53 -08:00
81bf7f3c43 tweak 2024-12-31 09:53:22 -08:00
1e0f577e09 wip 2024-12-31 09:09:11 -08:00
d2f41f9644 wip 2024-12-31 09:02:42 -08:00
2fcb30b76a wip 2024-12-31 08:56:59 -08:00
13257309dc init fix 2024-12-31 08:34:02 -08:00
7184b8e03f wip 2024-12-31 08:24:47 -08:00
4a1e35699a wip 2024-12-31 08:24:44 -08:00
0d086a179c wip 2024-12-31 08:23:04 -08:00
61eb1aaf21 notes 2024-12-31 05:11:26 -08:00
4 changed files with 248 additions and 140 deletions

341
mandel.s
View file

@ -1,43 +1,42 @@
; Our zero-page vars ; Our zero-page vars
sx = $80 ; i16: screen pixel x ox = $80 ; fixed8.24: center point x
sy = $82 ; i16: screen pixel y oy = $84 ; fixed8.24: center point y
ox = $84 ; fixed4.12: center point x cx = $88 ; fixed8.24: c_x
oy = $86 ; fixed4.12: center point y cy = $8c ; fixed8.24: c_y
cx = $88 ; fixed4.12: c_x
cy = $8a ; fixed4.12: c_y
zx = $8c ; fixed4.12: z_x
zy = $8e ; fixed4.12: z_y
zx_2 = $90 ; fixed4.12: z_x^2 zx = $90 ; fixed8.24: z_x
zy_2 = $92 ; fixed4.12: z_y^2 zy = $94 ; fixed8.24: z_y
zx_zy = $94 ; fixed4.12: z_x * z_y zx_2 = $98 ; fixed8.24: z_x^2
dist = $96 ; fixed4.12: z_x^2 + z_y^2 zy_2 = $9c ; fixed8.24: z_y^2
iter = $a0 ; u8: iteration count zx_zy = $a0 ; fixed8.24: z_x * z_y
dist = $a4 ; fixed8.24: z_x^2 + z_y^2
sx = $a8 ; i16: screen pixel x
sy = $aa ; i16: screen pixel y
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $ad ; u8: index into z_buffer
z_buffer_end = $ae ; u8: index into z_buffer
iter = $af ; u8: iteration count
zoom = $a1 ; u8: zoom shift level ptr = $b0 ; u16
count_frames = $a2 ; u8 pixel_ptr = $b2 ; u16
count_pixels = $a3 ; u8 zoom = $b4 ; u8: zoom shift level
total_ms = $a4 ; float48 fill_level = $b5 ; u8
total_pixels = $aa ; float48 pixel_color = $b6 ; u8
pixel_mask = $b7 ; u8
pixel_shift = $b8 ; u8
pixel_offset = $b9 ; u8
palette_offset = $ba ; u8
chroma_offset = $bb ; u8
palette_ticks = $bc ; u8
chroma_ticks = $bd ; u8
count_frames = $be ; u8
count_pixels = $bf ; u8
z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not total_pixels = $c0 ; float48
z_buffer_start = $b1 ; u8: index into z_buffer total_ms = $c6 ; float48
z_buffer_end = $b2 ; u8: index into z_buffer temp = $cc ; u16
temp = $b4 ; u16 temp2 = $ce ; u16
temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8
pixel_mask = $bb ; u8
pixel_shift = $bc ; u8
pixel_offset = $bd ; u8
fill_level = $be ; u8
palette_offset = $bf ; u8
palette_ticks = $c0 ; u8
chroma_ticks = $c1 ; u8
chroma_offset = $c2 ; u8
ptr = $c4 ; u16
palette_delay = 23 palette_delay = 23
chroma_delay = 137 chroma_delay = 137
@ -129,8 +128,11 @@ KEY_0 = 50
mantissa .byte 5 mantissa .byte 5
.endstruct .endstruct
.import mul_lobyte .import mul_lobyte256
.import mul_hibyte .import mul_hibyte256
.import mul_hibyte512
.import sqr_lobyte
.import sqr_hibyte
.data .data
@ -290,16 +292,16 @@ viewport_zoom:
.byte 6 .byte 6
viewport_ox: viewport_ox:
.word $0000 .dword $00000000
.word $f110 .dword $ff110000
.word $f110 .dword $ff110000
.word $e400 .dword $fe400000
viewport_oy: viewport_oy:
.word $0000 .dword $00000000
.word $fb60 .dword $ffb60000
.word $fbe0 .dword $ffbe0000
.word $0000 .dword $00000000
; 2 + 9 * byte cycles ; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2 .macro add bytes, dest, arg1, arg2
@ -318,7 +320,7 @@ viewport_oy:
; 38 cycles ; 38 cycles
.macro add32 dest, arg1, arg2 .macro add32 dest, arg1, arg2
add 4, dest, arg2, dest add 4, dest, arg1, arg2
.endmacro .endmacro
; 8 cycles ; 8 cycles
@ -348,7 +350,7 @@ viewport_oy:
sub 4, dest, arg1, arg2 sub 4, dest, arg1, arg2
.endmacro .endmacro
; 3 + 5 * (bytes - 1) cycles ; 3 + 5 * bytes cycles
.macro shl bytes, arg .macro shl bytes, arg
asl arg ; 3 cyc asl arg ; 3 cyc
.repeat bytes-1, i .repeat bytes-1, i
@ -356,17 +358,17 @@ viewport_oy:
.endrepeat .endrepeat
.endmacro .endmacro
; 8 cycles ; 13 cycles
.macro shl16 arg .macro shl16 arg
shl 2, arg shl 2, arg
.endmacro .endmacro
; 13 cycles ; 18 cycles
.macro shl24 arg .macro shl24 arg
shl 3, arg shl 3, arg
.endmacro .endmacro
; 18 cycles ; 23 cycles
.macro shl32 arg .macro shl32 arg
shl 4, arg shl 4, arg
.endmacro .endmacro
@ -423,32 +425,45 @@ viewport_oy:
round16 arg ; 11-27 cycles round16 arg ; 11-27 cycles
.endmacro .endmacro
.macro imul16_round dest, arg1, arg2, shift ; input: arg1, arg2 as fixed4.12
; output: dest as fixed8.24
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; ? cyc jsr imul16_func ; ? cyc
shift_round_16 FR2, shift ; 103-119 cycles for shift=4 copy32 dest, FR2 ; 24 cyc
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
.macro sqr16_round dest, arg, shift ; input: arg as fixed4.12
;imul16_round dest, arg, arg, shift ; output: dest as fixed8.24
.macro sqr16 dest, arg
copy16 FR0, arg ; 12 cyc copy16 FR0, arg ; 12 cyc
jsr sqr16_func ; ? cyc jsr sqr16_func ; ? cyc
shift_round_16 FR2, shift ; 103-119 cycles for shift=4 copy32 dest, FR2 ; 24 cyc
copy16 dest, FR2 + 2 ; 12 cyc
.endmacro .endmacro
; input: arg as u8
; output: dest as u16
; clobbers a, x ; clobbers a, x
.macro sqr8 dest, arg .macro sqr8 dest, arg
ldx arg ldx arg
txa lda sqr_lobyte,x
lsr
lda mul_lobyte,x
rol
sta dest sta dest
lda mul_hibyte,x lda sqr_hibyte,x
rol sta dest + 1
.endmacro
; input: arg as u8
; input/output: dest as u16
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
clc
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1
sta dest + 1 sta dest + 1
.endmacro .endmacro
@ -537,25 +552,22 @@ bank_switch_table:
clc ; 2 cyc clc ; 2 cyc
adc mul_factor_x ; 3 cyc adc mul_factor_x ; 3 cyc
tax ; 2 cyc tax ; 2 cyc
lda mul_hibyte,x ; 4 cyc bcc under256 ; 2 cyc
bcc next ; 2 cyc lda mul_hibyte512,x ; 4 cyc
; carry is set so we get to add 1 for free, but need to add 0x80 bcs next ; 2 cyc
adc #$7f ; 2 cyc under256:
clc ; 2 cyc lda mul_hibyte256,x ; 4 cyc
; stash the sum temporarily so we can use it as an operand to add
stx mul_product_lo ; 3 cyc
adc mul_product_lo ; 3 cyc
next:
sec ; 2 cyc sec ; 2 cyc
next:
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
lda mul_lobyte,x ; 4 cyc lda mul_lobyte256,x ; 4 cyc
; - a^2/2 ; - a^2/2
ldx mul_factor_a ; 3 cyc ldx mul_factor_a ; 3 cyc
sbc mul_lobyte,x ; 4 cyc sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte,x ; 4 cyc sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
; + x & a & 1: ; + x & a & 1:
@ -574,10 +586,10 @@ bank_switch_table:
; - x^2/2 ; - x^2/2
small_product: small_product:
sec ; 2 cyc sec ; 2 cyc
sbc mul_lobyte,x ; 4 cyc sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc lda mul_product_hi ; 3 cyc
sbc mul_hibyte,x ; 4 cyc sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc sta mul_product_hi ; 3 cyc
.endscope .endscope
.endif .endif
@ -784,14 +796,18 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l ; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg sqr8 result, arg
sqr8 result + 2, arg + 1 lda #0
sta result + 2
sta result + 3
imul8 inter, arg + 1, arg, xe imul8 inter, arg + 1, arg, xe
shl16 inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc rts ; 6 cyc
.endscope .endscope
.endmacro .endmacro
@ -859,8 +875,8 @@ next:
.proc mandelbrot .proc mandelbrot
; input: ; input:
; cx: position scaled to 4.12 fixed point - -8..+7.9 ; cx: position scaled to 8.24 fixed point - -128..+127.9
; cy: position scaled to 4.12 ; cy: position scaled to 8.24
; ;
; output: ; output:
; iter: iteration count at escape or 0 ; iter: iteration count at escape or 0
@ -872,12 +888,41 @@ next:
; zx_zy = 0 ; zx_zy = 0
; dist = 0 ; dist = 0
; iter = 0 ; iter = 0
; lda #00
; ldx #(iter - zx + 1)
;initloop:
; sta zx - 1,x
; dex
; bne initloop
; sta z_buffer_start
; sta z_buffer_end
lda #00 lda #00
ldx #(iter - zx + 1) sta zx
initloop: sta zx + 1
sta zx - 1,x sta zx + 2
dex sta zx + 3
bne initloop sta zy
sta zy + 1
sta zy + 2
sta zy + 3
sta zx_2
sta zx_2 + 1
sta zx_2 + 2
sta zx_2 + 3
sta zy_2
sta zy_2 + 1
sta zy_2 + 2
sta zy_2 + 3
sta zx_zy
sta zx_zy + 1
sta zx_zy + 2
sta zx_zy + 3
sta dist
sta dist + 1
sta dist + 2
sta dist + 3
sta iter
sta z_buffer_start sta z_buffer_start
sta z_buffer_end sta z_buffer_end
@ -889,6 +934,8 @@ loop:
keep_going: keep_going:
.macro quick_exit arg, max .macro quick_exit arg, max
; arg: fixed8.24
; max: integer
.local positive .local positive
.local negative .local negative
.local nope_out .local nope_out
@ -896,16 +943,16 @@ keep_going:
.local all_done .local all_done
; check sign bit ; check sign bit
lda arg + 1 lda arg + 3
bmi negative bmi negative
positive: positive:
cmp #((max) << 4) cmp #max
bmi all_done ; 'less than' bmi all_done ; 'less than'
jmp exit_path jmp exit_path
negative: negative:
cmp #(256 - ((max) << 4)) cmp #(256 - max)
beq first_equal ; 'equal' on first byte beq first_equal ; 'equal' on first byte
bpl all_done ; 'greater than' bpl all_done ; 'greater than'
@ -913,34 +960,44 @@ keep_going:
jmp exit_path jmp exit_path
first_equal: first_equal:
; following bytes all 0 shows it's really 'equal'
lda arg + 2
bne all_done
lda arg + 1
bne all_done
lda arg lda arg
beq nope_out ; 2nd byte 0 shows it's really 'equal' bne all_done
jmp exit_path
all_done: all_done:
.endmacro .endmacro
; 4.12: (-8 .. +7.9) ; 8.24: (-128 .. 127.9)
; zx = zx_2 - zy_2 + cx ; zx = zx_2 - zy_2 + cx
sub16 zx, zx_2, zy_2 sub32 zx, zx_2, zy_2
add16 zx, zx, cx add32 zx, zx, cx
quick_exit zx, 2 quick_exit zx, 2
; zy = zx_zy + zx_zy + cy ; zy = zx_zy + zx_zy + cy
add16 zy, zx_zy, zx_zy add32 zy, zx_zy, zx_zy
add16 zy, zy, cy add32 zy, zy, cy
quick_exit zy, 2 quick_exit zy, 2
; convert 8.24 -> 4.12: (-8 .. +7.9)
shift_round_16 zx, 4
shift_round_16 zy, 4
; zx_2 = zx * zx ; zx_2 = zx * zx
sqr16_round zx_2, zx, 4 sqr16 zx_2, zx + 2
; zy_2 = zy * zy ; zy_2 = zy * zy
sqr16_round zy_2, zy, 4 sqr16 zy_2, zy + 2
; zx_zy = zx * zy ; zx_zy = zx * zy
imul16_round zx_zy, zx, zy, 4 imul16 zx_zy, zx + 2, zy + 2
; dist = zx_2 + zy_2 ; dist = zx_2 + zy_2
add16 dist, zx_2, zy_2 add32 dist, zx_2, zy_2
quick_exit dist, 4 quick_exit dist, 4
; if we may be in the lake, look for looping output with a small buffer
@ -977,10 +1034,10 @@ z_buffer_loop:
; Compare the previously stored z values ; Compare the previously stored z values
ldy #0 ldy #0
z_compare zx z_compare zx + 2
z_compare zx + 1 z_compare zx + 3
z_compare zy z_compare zy + 2
z_compare zy + 1 z_compare zy + 3
cpy #4 cpy #4
bne z_no_matches bne z_no_matches
@ -995,10 +1052,10 @@ z_no_matches:
z_nothing_to_read: z_nothing_to_read:
; Store and expand ; Store and expand
z_store zx z_store zx + 2
z_store zx + 1 z_store zx + 3
z_store zy z_store zy + 2
z_store zy + 1 z_store zy + 3
z_advance z_advance
stx z_buffer_end stx z_buffer_end
@ -1049,14 +1106,17 @@ cont:
enough: enough:
.endmacro .endmacro
.macro zoom_factor dest, src, zoom, aspect .macro zoom_factor dest, src, aspect
; output: dest: fixed8.24
; input: src: fixed4.12
; aspect: fixed4.12
; clobbers A, X, flags, etc ; clobbers A, X, flags, etc
copy16 dest, src copy16 dest, src
scale_zoom dest scale_zoom dest
; cy = cy * (3 / 4) ; cy = cy * (3 / 4)
; cx = cx * (5 / 4) ; cx = cx * (5 / 4)
imul16_round dest, dest, aspect, 4 imul16 dest, dest, aspect
.endmacro .endmacro
.proc pset .proc pset
@ -1281,12 +1341,15 @@ skip_luma:
cpy #KEY_MINUS cpy #KEY_MINUS
beq minus beq minus
; temp = $0010 << (8 - zoom) ; temp+temp2 = $00010000 << (8 - zoom)
lda #$10
sta temp
lda #$00 lda #$00
sta temp
sta temp + 1 sta temp + 1
scale_zoom temp lda #$01
sta temp + 2
lda #$00
sta temp + 3
scale_zoom temp + 2
cpy #KEY_UP cpy #KEY_UP
beq up beq up
@ -1296,14 +1359,7 @@ skip_luma:
beq left beq left
cpy #KEY_RIGHT cpy #KEY_RIGHT
beq right beq right
cpy #KEY_1 jmp number_keys
beq one
cpy #KEY_2
beq two
cpy #KEY_3
beq three
cpy #KEY_4
beq four
skip_char: skip_char:
lda #0 lda #0
@ -1322,17 +1378,29 @@ minus:
dec zoom dec zoom
jmp done jmp done
up: up:
sub16 oy, oy, temp sub32 oy, oy, temp
jmp done jmp done
down: down:
add16 oy, oy, temp add32 oy, oy, temp
jmp done jmp done
left: left:
sub16 ox, ox, temp sub32 ox, ox, temp
jmp done jmp done
right: right:
add16 ox, ox, temp add32 ox, ox, temp
jmp done jmp done
number_keys:
cpy #KEY_1
beq one
cpy #KEY_2
beq two
cpy #KEY_3
beq three
cpy #KEY_4
beq four
jmp skip_char
one: one:
ldx #0 ldx #0
jmp load_key_viewport jmp load_key_viewport
@ -1394,17 +1462,32 @@ zero_byte_loop:
txa txa
asl a asl a
asl a
tax tax
lda viewport_ox,x lda viewport_ox,x
sta ox sta ox
lda viewport_oy,x lda viewport_oy,x
sta oy sta oy
inx inx
lda viewport_ox,x lda viewport_ox,x
sta ox + 1 sta ox + 1
lda viewport_oy,x lda viewport_oy,x
sta oy + 1 sta oy + 1
inx
lda viewport_ox,x
sta ox + 2
lda viewport_oy,x
sta oy + 2
inx
lda viewport_ox,x
sta ox + 3
lda viewport_oy,x
sta oy + 3
rts rts
.endproc .endproc
@ -1526,10 +1609,10 @@ skipped_mask:
not_skipped_mask: not_skipped_mask:
; run the fractal! ; run the fractal!
zoom_factor cx, sx, zoom, aspect_x zoom_factor cx, sx, aspect_x
add16 cx, cx, ox add32 cx, cx, ox
zoom_factor cy, sy, zoom, aspect_y zoom_factor cy, sy, aspect_y
add16 cy, cy, oy add32 cy, cy, oy
jsr mandelbrot jsr mandelbrot
jsr pset jsr pset

View file

@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13. The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
Iterations are capped at 255. Iterations are capped at 255.

View file

@ -11,19 +11,40 @@ function db(func) {
return lines.join('\n'); return lines.join('\n');
} }
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
}
console.log( console.log(
`.segment "TABLES" `.segment "TABLES"
.export mul_lobyte .export mul_lobyte256
.export mul_hibyte .export mul_hibyte256
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i) / 2 for the multiplier ; (i * i + 1) / 2 for the multiplier
.align 256 .align 256
mul_lobyte: mul_lobyte256:
${db((i) => ((i * i) >> 1) & 0xff)} ${db((i) => squares[i] & 0xff)}
.align 256 .align 256
mul_hibyte: mul_hibyte256:
${db((i) => ((i * i) >> 9) & 0xff)} ${db((i) => (squares[i] >> 8) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`); `);

View file

@ -1,9 +1,13 @@
things to try: things to try:
* skip add on the top-byte multiply in sqr8/mul8
* should save a few cycles, suggestion by jamey
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* try 3.13 fixed point instead of 4.12 for more precision * try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra bit? * can we get away without the extra bit?
* since the exit compare space would be 6.26, I think so
* y-axis mirror optimization * y-axis mirror optimization