forked from brooke/mandel-6502
Compare commits
13 commits
Author | SHA1 | Date | |
d8601bb856 | |||
7985ea9a39 | |||
cc83c76706 | |||
2e8893fd78 | |||
81bf7f3c43 | |||
1e0f577e09 | |||
d2f41f9644 | |||
2fcb30b76a | |||
13257309dc | |||
7184b8e03f | |||
4a1e35699a | |||
0d086a179c | |||
61eb1aaf21 |
4 changed files with 248 additions and 140 deletions
@ -1,43 +1,42 @@
; Our zero-page vars
sx = $80 ; i16: screen pixel x
sy = $82 ; i16: screen pixel y
ox = $84 ; fixed4.12: center point x
oy = $86 ; fixed4.12: center point y
cx = $88 ; fixed4.12: c_x
cy = $8a ; fixed4.12: c_y
zx = $8c ; fixed4.12: z_x
zy = $8e ; fixed4.12: z_y
ox = $80 ; fixed8.24: center point x
oy = $84 ; fixed8.24: center point y
cx = $88 ; fixed8.24: c_x
cy = $8c ; fixed8.24: c_y
zx_2 = $90 ; fixed4.12: z_x^2
zy_2 = $92 ; fixed4.12: z_y^2
zx_zy = $94 ; fixed4.12: z_x * z_y
dist = $96 ; fixed4.12: z_x^2 + z_y^2
zx = $90 ; fixed8.24: z_x
zy = $94 ; fixed8.24: z_y
zx_2 = $98 ; fixed8.24: z_x^2
zy_2 = $9c ; fixed8.24: z_y^2
iter = $a0 ; u8: iteration count
zx_zy = $a0 ; fixed8.24: z_x * z_y
dist = $a4 ; fixed8.24: z_x^2 + z_y^2
sx = $a8 ; i16: screen pixel x
sy = $aa ; i16: screen pixel y
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $ad ; u8: index into z_buffer
z_buffer_end = $ae ; u8: index into z_buffer
iter = $af ; u8: iteration count
zoom = $a1 ; u8: zoom shift level
count_frames = $a2 ; u8
count_pixels = $a3 ; u8
total_ms = $a4 ; float48
total_pixels = $aa ; float48
ptr = $b0 ; u16
pixel_ptr = $b2 ; u16
zoom = $b4 ; u8: zoom shift level
fill_level = $b5 ; u8
pixel_color = $b6 ; u8
pixel_mask = $b7 ; u8
pixel_shift = $b8 ; u8
pixel_offset = $b9 ; u8
palette_offset = $ba ; u8
chroma_offset = $bb ; u8
palette_ticks = $bc ; u8
chroma_ticks = $bd ; u8
count_frames = $be ; u8
count_pixels = $bf ; u8
z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $b1 ; u8: index into z_buffer
z_buffer_end = $b2 ; u8: index into z_buffer
temp = $b4 ; u16
temp2 = $b6 ; u16
pixel_ptr = $b8 ; u16
pixel_color = $ba ; u8
pixel_mask = $bb ; u8
pixel_shift = $bc ; u8
pixel_offset = $bd ; u8
fill_level = $be ; u8
palette_offset = $bf ; u8
palette_ticks = $c0 ; u8
chroma_ticks = $c1 ; u8
chroma_offset = $c2 ; u8
ptr = $c4 ; u16
total_pixels = $c0 ; float48
total_ms = $c6 ; float48
temp = $cc ; u16
temp2 = $ce ; u16
palette_delay = 23
chroma_delay = 137
@ -129,8 +128,11 @@ KEY_0 = 50
mantissa .byte 5
.import mul_lobyte
.import mul_hibyte
.import mul_lobyte256
.import mul_hibyte256
.import mul_hibyte512
.import sqr_lobyte
.import sqr_hibyte
@ -290,16 +292,16 @@ viewport_zoom:
.byte 6
.word $0000
.word $f110
.word $f110
.word $e400
.dword $00000000
.dword $ff110000
.dword $ff110000
.dword $fe400000
.word $0000
.word $fb60
.word $fbe0
.word $0000
.dword $00000000
.dword $ffb60000
.dword $ffbe0000
.dword $00000000
; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2
@ -318,7 +320,7 @@ viewport_oy:
; 38 cycles
.macro add32 dest, arg1, arg2
add 4, dest, arg2, dest
add 4, dest, arg1, arg2
; 8 cycles
@ -348,7 +350,7 @@ viewport_oy:
sub 4, dest, arg1, arg2
; 3 + 5 * (bytes - 1) cycles
; 3 + 5 * bytes cycles
.macro shl bytes, arg
asl arg ; 3 cyc
.repeat bytes-1, i
@ -356,17 +358,17 @@ viewport_oy:
; 8 cycles
; 13 cycles
.macro shl16 arg
shl 2, arg
; 13 cycles
; 18 cycles
.macro shl24 arg
shl 3, arg
; 18 cycles
; 23 cycles
.macro shl32 arg
shl 4, arg
@ -423,32 +425,45 @@ viewport_oy:
round16 arg ; 11-27 cycles
.macro imul16_round dest, arg1, arg2, shift
; input: arg1, arg2 as fixed4.12
; output: dest as fixed8.24
.macro imul16 dest, arg1, arg2
copy16 FR0, arg1 ; 12 cyc
copy16 FR1, arg2 ; 12 cyc
jsr imul16_func ; ? cyc
shift_round_16 FR2, shift ; 103-119 cycles for shift=4
copy16 dest, FR2 + 2 ; 12 cyc
copy32 dest, FR2 ; 24 cyc
.macro sqr16_round dest, arg, shift
;imul16_round dest, arg, arg, shift
; input: arg as fixed4.12
; output: dest as fixed8.24
.macro sqr16 dest, arg
copy16 FR0, arg ; 12 cyc
jsr sqr16_func ; ? cyc
shift_round_16 FR2, shift ; 103-119 cycles for shift=4
copy16 dest, FR2 + 2 ; 12 cyc
jsr sqr16_func ; ? cyc
copy32 dest, FR2 ; 24 cyc
; input: arg as u8
; output: dest as u16
; clobbers a, x
.macro sqr8 dest, arg
ldx arg
lda mul_lobyte,x
lda sqr_lobyte,x
sta dest
lda mul_hibyte,x
lda sqr_hibyte,x
sta dest + 1
; input: arg as u8
; input/output: dest as u16
; clobbers a, x
.macro sqr8_add16 dest, arg
ldx arg
lda sqr_lobyte,x
adc dest
sta dest
lda sqr_hibyte,x
adc dest + 1
sta dest + 1
@ -537,25 +552,22 @@ bank_switch_table:
clc ; 2 cyc
adc mul_factor_x ; 3 cyc
tax ; 2 cyc
lda mul_hibyte,x ; 4 cyc
bcc next ; 2 cyc
; carry is set so we get to add 1 for free, but need to add 0x80
adc #$7f ; 2 cyc
clc ; 2 cyc
; stash the sum temporarily so we can use it as an operand to add
stx mul_product_lo ; 3 cyc
adc mul_product_lo ; 3 cyc
bcc under256 ; 2 cyc
lda mul_hibyte512,x ; 4 cyc
bcs next ; 2 cyc
lda mul_hibyte256,x ; 4 cyc
sec ; 2 cyc
sta mul_product_hi ; 3 cyc
lda mul_lobyte,x ; 4 cyc
lda mul_lobyte256,x ; 4 cyc
; - a^2/2
ldx mul_factor_a ; 3 cyc
sbc mul_lobyte,x ; 4 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte,x ; 4 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
; + x & a & 1:
@ -574,10 +586,10 @@ bank_switch_table:
; - x^2/2
sec ; 2 cyc
sbc mul_lobyte,x ; 4 cyc
sbc mul_lobyte256,x ; 4 cyc
sta mul_product_lo ; 3 cyc
lda mul_product_hi ; 3 cyc
sbc mul_hibyte,x ; 4 cyc
sbc mul_hibyte256,x ; 4 cyc
sta mul_product_hi ; 3 cyc
@ -784,14 +796,18 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg
sqr8 result + 2, arg + 1
lda #0
sta result + 2
sta result + 3
imul8 inter, arg + 1, arg, xe
shl16 inter
add16 result + 1, result + 1, inter
add_carry result + 3
add16 result + 1, result + 1, inter
add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc
@ -859,8 +875,8 @@ next:
.proc mandelbrot
; input:
; cx: position scaled to 4.12 fixed point - -8..+7.9
; cy: position scaled to 4.12
; cx: position scaled to 8.24 fixed point - -128..+127.9
; cy: position scaled to 8.24
; output:
; iter: iteration count at escape or 0
@ -872,12 +888,41 @@ next:
; zx_zy = 0
; dist = 0
; iter = 0
; lda #00
; ldx #(iter - zx + 1)
; sta zx - 1,x
; dex
; bne initloop
; sta z_buffer_start
; sta z_buffer_end
lda #00
ldx #(iter - zx + 1)
sta zx - 1,x
bne initloop
sta zx
sta zx + 1
sta zx + 2
sta zx + 3
sta zy
sta zy + 1
sta zy + 2
sta zy + 3
sta zx_2
sta zx_2 + 1
sta zx_2 + 2
sta zx_2 + 3
sta zy_2
sta zy_2 + 1
sta zy_2 + 2
sta zy_2 + 3
sta zx_zy
sta zx_zy + 1
sta zx_zy + 2
sta zx_zy + 3
sta dist
sta dist + 1
sta dist + 2
sta dist + 3
sta iter
sta z_buffer_start
sta z_buffer_end
@ -889,6 +934,8 @@ loop:
.macro quick_exit arg, max
; arg: fixed8.24
; max: integer
.local positive
.local negative
.local nope_out
@ -896,51 +943,61 @@ keep_going:
.local all_done
; check sign bit
lda arg + 1
lda arg + 3
bmi negative
cmp #((max) << 4)
cmp #max
bmi all_done ; 'less than'
jmp exit_path
cmp #(256 - ((max) << 4))
cmp #(256 - max)
beq first_equal ; 'equal' on first byte
bpl all_done ; 'greater than'
jmp exit_path
; following bytes all 0 shows it's really 'equal'
lda arg + 2
bne all_done
lda arg + 1
bne all_done
lda arg
beq nope_out ; 2nd byte 0 shows it's really 'equal'
bne all_done
jmp exit_path
; 4.12: (-8 .. +7.9)
; 8.24: (-128 .. 127.9)
; zx = zx_2 - zy_2 + cx
sub16 zx, zx_2, zy_2
add16 zx, zx, cx
sub32 zx, zx_2, zy_2
add32 zx, zx, cx
quick_exit zx, 2
; zy = zx_zy + zx_zy + cy
add16 zy, zx_zy, zx_zy
add16 zy, zy, cy
add32 zy, zx_zy, zx_zy
add32 zy, zy, cy
quick_exit zy, 2
; convert 8.24 -> 4.12: (-8 .. +7.9)
shift_round_16 zx, 4
shift_round_16 zy, 4
; zx_2 = zx * zx
sqr16_round zx_2, zx, 4
sqr16 zx_2, zx + 2
; zy_2 = zy * zy
sqr16_round zy_2, zy, 4
sqr16 zy_2, zy + 2
; zx_zy = zx * zy
imul16_round zx_zy, zx, zy, 4
imul16 zx_zy, zx + 2, zy + 2
; dist = zx_2 + zy_2
add16 dist, zx_2, zy_2
add32 dist, zx_2, zy_2
quick_exit dist, 4
; if may be in the lake, look for looping output with a small buffer
@ -977,10 +1034,10 @@ z_buffer_loop:
; Compare the previously stored z values
ldy #0
z_compare zx
z_compare zx + 1
z_compare zy
z_compare zy + 1
z_compare zx + 2
z_compare zx + 3
z_compare zy + 2
z_compare zy + 3
cpy #4
bne z_no_matches
@ -995,10 +1052,10 @@ z_no_matches:
; Store and expand
z_store zx
z_store zx + 1
z_store zy
z_store zy + 1
z_store zx + 2
z_store zx + 3
z_store zy + 2
z_store zy + 3
stx z_buffer_end
@ -1049,14 +1106,17 @@ cont:
.macro zoom_factor dest, src, zoom, aspect
.macro zoom_factor dest, src, aspect
; output: dest: fixed8.24
; input: src: fixed4.12
; aspect: fixed4.12
; clobbers A, X, flags, etc
copy16 dest, src
scale_zoom dest
; cy = cy * (3 / 4)
; cx = cx * (5 / 4)
imul16_round dest, dest, aspect, 4
imul16 dest, dest, aspect
.proc pset
@ -1281,12 +1341,15 @@ skip_luma:
beq minus
; temp = $0010 << (8 - zoom)
lda #$10
sta temp
; temp+temp2 = $00010000 << (8 - zoom)
lda #$00
sta temp
sta temp + 1
scale_zoom temp
lda #$01
sta temp + 2
lda #$00
sta temp + 3
scale_zoom temp + 2
cpy #KEY_UP
beq up
@ -1296,14 +1359,7 @@ skip_luma:
beq left
beq right
cpy #KEY_1
beq one
cpy #KEY_2
beq two
cpy #KEY_3
beq three
cpy #KEY_4
beq four
jmp number_keys
lda #0
@ -1322,17 +1378,29 @@ minus:
dec zoom
jmp done
sub16 oy, oy, temp
sub32 oy, oy, temp
jmp done
add16 oy, oy, temp
add32 oy, oy, temp
jmp done
sub16 ox, ox, temp
sub32 ox, ox, temp
jmp done
add16 ox, ox, temp
add32 ox, ox, temp
jmp done
cpy #KEY_1
beq one
cpy #KEY_2
beq two
cpy #KEY_3
beq three
cpy #KEY_4
beq four
jmp skip_char
ldx #0
jmp load_key_viewport
@ -1394,17 +1462,32 @@ zero_byte_loop:
asl a
asl a
lda viewport_ox,x
sta ox
lda viewport_oy,x
sta oy
lda viewport_ox,x
sta ox + 1
lda viewport_oy,x
sta oy + 1
lda viewport_ox,x
sta ox + 2
lda viewport_oy,x
sta oy + 2
lda viewport_ox,x
sta ox + 3
lda viewport_oy,x
sta oy + 3
@ -1526,10 +1609,10 @@ skipped_mask:
; run the fractal!
zoom_factor cx, sx, zoom, aspect_x
add16 cx, cx, ox
zoom_factor cy, sy, zoom, aspect_y
add16 cy, cy, oy
zoom_factor cx, sx, aspect_x
add32 cx, cx, ox
zoom_factor cy, sy, aspect_y
add32 cy, cy, oy
jsr mandelbrot
jsr pset
@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from
The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
The Mandelbrot calculations are done using 4.12-precision fixed-point numbers with 8.24-precision intermediates. It may be possible to squeeze this down to 3.13/6.26.
Iterations are capped at 255.
@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e
## Todo
See ideas in `todo.md`.
See ideas in `todo.md`.
@ -11,19 +11,40 @@ function db(func) {
return lines.join('\n');
let squares = [];
for (let i = 0; i < 512; i++) {
squares.push(Math.trunc((i * i + 1) / 2));
`.segment "TABLES"
.export mul_lobyte
.export mul_hibyte
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i) / 2 for the multiplier
; (i * i + 1) / 2 for the multiplier
.align 256
${db((i) => ((i * i) >> 1) & 0xff)}
${db((i) => squares[i] & 0xff)}
.align 256
${db((i) => ((i * i) >> 9) & 0xff)}
${db((i) => (squares[i] >> 8) & 0xff)}
.align 256
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
${db((i) => (i * i) & 0xff)}
.align 256
${db((i) => ((i * i) >> 8) & 0xff)}
@ -1,9 +1,13 @@
things to try:
* skip add on the top-byte multiply in sqr8/mul8
* should save a few cycles, suggestion by jamey
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra integer bit?
* I think so, since the exit comparison would then be done in 6.26 space
* y-axis mirror optimization
Reference in a new issue