forked from brooke/mandel-6502
Compare commits
13 commits
shrink-tab
...
main
Author | SHA1 | Date | |
---|---|---|---|
d8601bb856 | |||
7985ea9a39 | |||
cc83c76706 | |||
2e8893fd78 | |||
81bf7f3c43 | |||
1e0f577e09 | |||
d2f41f9644 | |||
2fcb30b76a | |||
13257309dc | |||
7184b8e03f | |||
4a1e35699a | |||
0d086a179c | |||
61eb1aaf21 |
4 changed files with 248 additions and 140 deletions
345
mandel.s
345
mandel.s
|
@ -1,43 +1,42 @@
|
|||
; Our zero-page vars
|
||||
sx = $80 ; i16: screen pixel x
|
||||
sy = $82 ; i16: screen pixel y
|
||||
ox = $84 ; fixed4.12: center point x
|
||||
oy = $86 ; fixed4.12: center point y
|
||||
cx = $88 ; fixed4.12: c_x
|
||||
cy = $8a ; fixed4.12: c_y
|
||||
zx = $8c ; fixed4.12: z_x
|
||||
zy = $8e ; fixed4.12: z_y
|
||||
ox = $80 ; fixed8.24: center point x
|
||||
oy = $84 ; fixed8.24: center point y
|
||||
cx = $88 ; fixed8.24: c_x
|
||||
cy = $8c ; fixed8.24: c_y
|
||||
|
||||
zx_2 = $90 ; fixed4.12: z_x^2
|
||||
zy_2 = $92 ; fixed4.12: z_y^2
|
||||
zx_zy = $94 ; fixed4.12: z_x * z_y
|
||||
dist = $96 ; fixed4.12: z_x^2 + z_y^2
|
||||
zx = $90 ; fixed8.24: z_x
|
||||
zy = $94 ; fixed8.24: z_y
|
||||
zx_2 = $98 ; fixed8.24: z_x^2
|
||||
zy_2 = $9c ; fixed8.24: z_y^2
|
||||
|
||||
iter = $a0 ; u8: iteration count
|
||||
zx_zy = $a0 ; fixed8.24: z_x * z_y
|
||||
dist = $a4 ; fixed8.24: z_x^2 + z_y^2
|
||||
sx = $a8 ; i16: screen pixel x
|
||||
sy = $aa ; i16: screen pixel y
|
||||
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
|
||||
z_buffer_start = $ad ; u8: index into z_buffer
|
||||
z_buffer_end = $ae ; u8: index into z_buffer
|
||||
iter = $af ; u8: iteration count
|
||||
|
||||
zoom = $a1 ; u8: zoom shift level
|
||||
count_frames = $a2 ; u8
|
||||
count_pixels = $a3 ; u8
|
||||
total_ms = $a4 ; float48
|
||||
total_pixels = $aa ; float48
|
||||
ptr = $b0 ; u16
|
||||
pixel_ptr = $b2 ; u16
|
||||
zoom = $b4 ; u8: zoom shift level
|
||||
fill_level = $b5 ; u8
|
||||
pixel_color = $b6 ; u8
|
||||
pixel_mask = $b7 ; u8
|
||||
pixel_shift = $b8 ; u8
|
||||
pixel_offset = $b9 ; u8
|
||||
palette_offset = $ba ; u8
|
||||
chroma_offset = $bb ; u8
|
||||
palette_ticks = $bc ; u8
|
||||
chroma_ticks = $bd ; u8
|
||||
count_frames = $be ; u8
|
||||
count_pixels = $bf ; u8
|
||||
|
||||
z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
|
||||
z_buffer_start = $b1 ; u8: index into z_buffer
|
||||
z_buffer_end = $b2 ; u8: index into z_buffer
|
||||
temp = $b4 ; u16
|
||||
temp2 = $b6 ; u16
|
||||
pixel_ptr = $b8 ; u16
|
||||
pixel_color = $ba ; u8
|
||||
pixel_mask = $bb ; u8
|
||||
pixel_shift = $bc ; u8
|
||||
pixel_offset = $bd ; u8
|
||||
fill_level = $be ; u8
|
||||
palette_offset = $bf ; u8
|
||||
|
||||
palette_ticks = $c0 ; u8
|
||||
chroma_ticks = $c1 ; u8
|
||||
chroma_offset = $c2 ; u8
|
||||
ptr = $c4 ; u16
|
||||
total_pixels = $c0 ; float48
|
||||
total_ms = $c6 ; float48
|
||||
temp = $cc ; u16
|
||||
temp2 = $ce ; u16
|
||||
|
||||
palette_delay = 23
|
||||
chroma_delay = 137
|
||||
|
@ -129,8 +128,11 @@ KEY_0 = 50
|
|||
mantissa .byte 5
|
||||
.endstruct
|
||||
|
||||
.import mul_lobyte
|
||||
.import mul_hibyte
|
||||
.import mul_lobyte256
|
||||
.import mul_hibyte256
|
||||
.import mul_hibyte512
|
||||
.import sqr_lobyte
|
||||
.import sqr_hibyte
|
||||
|
||||
.data
|
||||
|
||||
|
@ -290,16 +292,16 @@ viewport_zoom:
|
|||
.byte 6
|
||||
|
||||
viewport_ox:
|
||||
.word $0000
|
||||
.word $f110
|
||||
.word $f110
|
||||
.word $e400
|
||||
.dword $00000000
|
||||
.dword $ff110000
|
||||
.dword $ff110000
|
||||
.dword $fe400000
|
||||
|
||||
viewport_oy:
|
||||
.word $0000
|
||||
.word $fb60
|
||||
.word $fbe0
|
||||
.word $0000
|
||||
.dword $00000000
|
||||
.dword $ffb60000
|
||||
.dword $ffbe0000
|
||||
.dword $00000000
|
||||
|
||||
; 2 + 9 * byte cycles
|
||||
.macro add bytes, dest, arg1, arg2
|
||||
|
@ -318,7 +320,7 @@ viewport_oy:
|
|||
|
||||
; 38 cycles
|
||||
.macro add32 dest, arg1, arg2
|
||||
add 4, dest, arg2, dest
|
||||
add 4, dest, arg1, arg2
|
||||
.endmacro
|
||||
|
||||
; 8 cycles
|
||||
|
@ -348,7 +350,7 @@ viewport_oy:
|
|||
sub 4, dest, arg1, arg2
|
||||
.endmacro
|
||||
|
||||
; 3 + 5 * (bytes - 1) cycles
|
||||
; 3 + 5 * bytes cycles
|
||||
.macro shl bytes, arg
|
||||
asl arg ; 3 cyc
|
||||
.repeat bytes-1, i
|
||||
|
@ -356,17 +358,17 @@ viewport_oy:
|
|||
.endrepeat
|
||||
.endmacro
|
||||
|
||||
; 8 cycles
|
||||
; 13 cycles
|
||||
.macro shl16 arg
|
||||
shl 2, arg
|
||||
.endmacro
|
||||
|
||||
; 13 cycles
|
||||
; 18 cycles
|
||||
.macro shl24 arg
|
||||
shl 3, arg
|
||||
.endmacro
|
||||
|
||||
; 18 cycles
|
||||
; 23 cycles
|
||||
.macro shl32 arg
|
||||
shl 4, arg
|
||||
.endmacro
|
||||
|
@ -423,32 +425,45 @@ viewport_oy:
|
|||
round16 arg ; 11-27 cycles
|
||||
.endmacro
|
||||
|
||||
.macro imul16_round dest, arg1, arg2, shift
|
||||
; input: arg1, arg2 as fixed4.12
|
||||
; output: dest as fixed8.24
|
||||
.macro imul16 dest, arg1, arg2
|
||||
copy16 FR0, arg1 ; 12 cyc
|
||||
copy16 FR1, arg2 ; 12 cyc
|
||||
jsr imul16_func ; ? cyc
|
||||
shift_round_16 FR2, shift ; 103-119 cycles for shift=4
|
||||
copy16 dest, FR2 + 2 ; 12 cyc
|
||||
copy32 dest, FR2 ; 24 cyc
|
||||
.endmacro
|
||||
|
||||
.macro sqr16_round dest, arg, shift
|
||||
;imul16_round dest, arg, arg, shift
|
||||
; input: arg as fixed4.12
|
||||
; output: dest as fixed8.24
|
||||
.macro sqr16 dest, arg
|
||||
copy16 FR0, arg ; 12 cyc
|
||||
jsr sqr16_func ; ? cyc
|
||||
shift_round_16 FR2, shift ; 103-119 cycles for shift=4
|
||||
copy16 dest, FR2 + 2 ; 12 cyc
|
||||
jsr sqr16_func ; ? cyc
|
||||
copy32 dest, FR2 ; 24 cyc
|
||||
.endmacro
|
||||
|
||||
; input: arg as u8
|
||||
; output: dest as u16
|
||||
; clobbers a, x
|
||||
.macro sqr8 dest, arg
|
||||
ldx arg
|
||||
txa
|
||||
lsr
|
||||
lda mul_lobyte,x
|
||||
rol
|
||||
lda sqr_lobyte,x
|
||||
sta dest
|
||||
lda mul_hibyte,x
|
||||
rol
|
||||
lda sqr_hibyte,x
|
||||
sta dest + 1
|
||||
.endmacro
|
||||
|
||||
; input: arg as u8
|
||||
; input/output: dest as u16
|
||||
; clobbers a, x
|
||||
.macro sqr8_add16 dest, arg
|
||||
ldx arg
|
||||
clc
|
||||
lda sqr_lobyte,x
|
||||
adc dest
|
||||
sta dest
|
||||
lda sqr_hibyte,x
|
||||
adc dest + 1
|
||||
sta dest + 1
|
||||
.endmacro
|
||||
|
||||
|
@ -537,25 +552,22 @@ bank_switch_table:
|
|||
clc ; 2 cyc
|
||||
adc mul_factor_x ; 3 cyc
|
||||
tax ; 2 cyc
|
||||
lda mul_hibyte,x ; 4 cyc
|
||||
bcc next ; 2 cyc
|
||||
; carry is set so we get to add 1 for free, but need to add 0x80
|
||||
adc #$7f ; 2 cyc
|
||||
clc ; 2 cyc
|
||||
; stash the sum temporarily so we can use it as an operand to add
|
||||
stx mul_product_lo ; 3 cyc
|
||||
adc mul_product_lo ; 3 cyc
|
||||
next:
|
||||
bcc under256 ; 2 cyc
|
||||
lda mul_hibyte512,x ; 4 cyc
|
||||
bcs next ; 2 cyc
|
||||
under256:
|
||||
lda mul_hibyte256,x ; 4 cyc
|
||||
sec ; 2 cyc
|
||||
next:
|
||||
sta mul_product_hi ; 3 cyc
|
||||
lda mul_lobyte,x ; 4 cyc
|
||||
lda mul_lobyte256,x ; 4 cyc
|
||||
|
||||
; - a^2/2
|
||||
ldx mul_factor_a ; 3 cyc
|
||||
sbc mul_lobyte,x ; 4 cyc
|
||||
sbc mul_lobyte256,x ; 4 cyc
|
||||
sta mul_product_lo ; 3 cyc
|
||||
lda mul_product_hi ; 3 cyc
|
||||
sbc mul_hibyte,x ; 4 cyc
|
||||
sbc mul_hibyte256,x ; 4 cyc
|
||||
sta mul_product_hi ; 3 cyc
|
||||
|
||||
; + x & a & 1:
|
||||
|
@ -574,10 +586,10 @@ bank_switch_table:
|
|||
; - x^2/2
|
||||
small_product:
|
||||
sec ; 2 cyc
|
||||
sbc mul_lobyte,x ; 4 cyc
|
||||
sbc mul_lobyte256,x ; 4 cyc
|
||||
sta mul_product_lo ; 3 cyc
|
||||
lda mul_product_hi ; 3 cyc
|
||||
sbc mul_hibyte,x ; 4 cyc
|
||||
sbc mul_hibyte256,x ; 4 cyc
|
||||
sta mul_product_hi ; 3 cyc
|
||||
.endscope
|
||||
.endif
|
||||
|
@ -784,14 +796,18 @@ arg2_pos:
|
|||
; h*h*256*256 + h*l*256 + h*l*256 + l*l
|
||||
|
||||
sqr8 result, arg
|
||||
sqr8 result + 2, arg + 1
|
||||
lda #0
|
||||
sta result + 2
|
||||
sta result + 3
|
||||
|
||||
imul8 inter, arg + 1, arg, xe
|
||||
shl16 inter
|
||||
add16 result + 1, result + 1, inter
|
||||
add_carry result + 3
|
||||
add16 result + 1, result + 1, inter
|
||||
add_carry result + 3
|
||||
|
||||
sqr8_add16 result + 2, arg + 1
|
||||
|
||||
rts ; 6 cyc
|
||||
.endscope
|
||||
.endmacro
|
||||
|
@ -859,8 +875,8 @@ next:
|
|||
|
||||
.proc mandelbrot
|
||||
; input:
|
||||
; cx: position scaled to 4.12 fixed point - -8..+7.9
|
||||
; cy: position scaled to 4.12
|
||||
; cx: position scaled to 8.24 fixed point - -128..+127.9
|
||||
; cy: position scaled to 8.24
|
||||
;
|
||||
; output:
|
||||
; iter: iteration count at escape or 0
|
||||
|
@ -872,12 +888,41 @@ next:
|
|||
; zx_zy = 0
|
||||
; dist = 0
|
||||
; iter = 0
|
||||
; lda #00
|
||||
; ldx #(iter - zx + 1)
|
||||
;initloop:
|
||||
; sta zx - 1,x
|
||||
; dex
|
||||
; bne initloop
|
||||
; sta z_buffer_start
|
||||
; sta z_buffer_end
|
||||
|
||||
lda #00
|
||||
ldx #(iter - zx + 1)
|
||||
initloop:
|
||||
sta zx - 1,x
|
||||
dex
|
||||
bne initloop
|
||||
sta zx
|
||||
sta zx + 1
|
||||
sta zx + 2
|
||||
sta zx + 3
|
||||
sta zy
|
||||
sta zy + 1
|
||||
sta zy + 2
|
||||
sta zy + 3
|
||||
sta zx_2
|
||||
sta zx_2 + 1
|
||||
sta zx_2 + 2
|
||||
sta zx_2 + 3
|
||||
sta zy_2
|
||||
sta zy_2 + 1
|
||||
sta zy_2 + 2
|
||||
sta zy_2 + 3
|
||||
sta zx_zy
|
||||
sta zx_zy + 1
|
||||
sta zx_zy + 2
|
||||
sta zx_zy + 3
|
||||
sta dist
|
||||
sta dist + 1
|
||||
sta dist + 2
|
||||
sta dist + 3
|
||||
sta iter
|
||||
sta z_buffer_start
|
||||
sta z_buffer_end
|
||||
|
||||
|
@ -889,6 +934,8 @@ loop:
|
|||
keep_going:
|
||||
|
||||
.macro quick_exit arg, max
|
||||
; arg: fixed8.24
|
||||
; max: integer
|
||||
.local positive
|
||||
.local negative
|
||||
.local nope_out
|
||||
|
@ -896,51 +943,61 @@ keep_going:
|
|||
.local all_done
|
||||
|
||||
; check sign bit
|
||||
lda arg + 1
|
||||
lda arg + 3
|
||||
bmi negative
|
||||
|
||||
positive:
|
||||
cmp #((max) << 4)
|
||||
cmp #max
|
||||
bmi all_done ; 'less than'
|
||||
jmp exit_path
|
||||
|
||||
negative:
|
||||
cmp #(256 - ((max) << 4))
|
||||
cmp #(256 - max)
|
||||
beq first_equal ; 'equal' on first byte
|
||||
bpl all_done ; 'greater than'
|
||||
|
||||
nope_out:
|
||||
jmp exit_path
|
||||
|
||||
|
||||
first_equal:
|
||||
; following bytes all 0 shows it's really 'equal'
|
||||
lda arg + 2
|
||||
bne all_done
|
||||
lda arg + 1
|
||||
bne all_done
|
||||
lda arg
|
||||
beq nope_out ; 2nd byte 0 shows it's really 'equal'
|
||||
bne all_done
|
||||
jmp exit_path
|
||||
|
||||
all_done:
|
||||
.endmacro
|
||||
|
||||
; 4.12: (-8 .. +7.9)
|
||||
; 8.24: (-128 .. 127.9)
|
||||
; zx = zx_2 - zy_2 + cx
|
||||
sub16 zx, zx_2, zy_2
|
||||
add16 zx, zx, cx
|
||||
sub32 zx, zx_2, zy_2
|
||||
add32 zx, zx, cx
|
||||
quick_exit zx, 2
|
||||
|
||||
; zy = zx_zy + zx_zy + cy
|
||||
add16 zy, zx_zy, zx_zy
|
||||
add16 zy, zy, cy
|
||||
add32 zy, zx_zy, zx_zy
|
||||
add32 zy, zy, cy
|
||||
quick_exit zy, 2
|
||||
|
||||
; convert 8.24 -> 4.12: (-8 .. +7.9)
|
||||
shift_round_16 zx, 4
|
||||
shift_round_16 zy, 4
|
||||
|
||||
; zx_2 = zx * zx
|
||||
sqr16_round zx_2, zx, 4
|
||||
sqr16 zx_2, zx + 2
|
||||
|
||||
; zy_2 = zy * zy
|
||||
sqr16_round zy_2, zy, 4
|
||||
sqr16 zy_2, zy + 2
|
||||
|
||||
; zx_zy = zx * zy
|
||||
imul16_round zx_zy, zx, zy, 4
|
||||
imul16 zx_zy, zx + 2, zy + 2
|
||||
|
||||
; dist = zx_2 + zy_2
|
||||
add16 dist, zx_2, zy_2
|
||||
add32 dist, zx_2, zy_2
|
||||
quick_exit dist, 4
|
||||
|
||||
; if we may be in the lake, look for looping output with a small buffer
|
||||
|
@ -977,10 +1034,10 @@ z_buffer_loop:
|
|||
|
||||
; Compare the previously stored z values
|
||||
ldy #0
|
||||
z_compare zx
|
||||
z_compare zx + 1
|
||||
z_compare zy
|
||||
z_compare zy + 1
|
||||
z_compare zx + 2
|
||||
z_compare zx + 3
|
||||
z_compare zy + 2
|
||||
z_compare zy + 3
|
||||
|
||||
cpy #4
|
||||
bne z_no_matches
|
||||
|
@ -995,10 +1052,10 @@ z_no_matches:
|
|||
z_nothing_to_read:
|
||||
|
||||
; Store and expand
|
||||
z_store zx
|
||||
z_store zx + 1
|
||||
z_store zy
|
||||
z_store zy + 1
|
||||
z_store zx + 2
|
||||
z_store zx + 3
|
||||
z_store zy + 2
|
||||
z_store zy + 3
|
||||
z_advance
|
||||
stx z_buffer_end
|
||||
|
||||
|
@ -1049,14 +1106,17 @@ cont:
|
|||
enough:
|
||||
.endmacro
|
||||
|
||||
.macro zoom_factor dest, src, zoom, aspect
|
||||
.macro zoom_factor dest, src, aspect
|
||||
; output: dest: fixed8.24
|
||||
; input: src: fixed4.12
|
||||
; aspect: fixed4.12
|
||||
; clobbers A, X, flags, etc
|
||||
copy16 dest, src
|
||||
scale_zoom dest
|
||||
|
||||
; cy = cy * (3 / 4)
|
||||
; cx = cx * (5 / 4)
|
||||
imul16_round dest, dest, aspect, 4
|
||||
imul16 dest, dest, aspect
|
||||
.endmacro
|
||||
|
||||
.proc pset
|
||||
|
@ -1281,12 +1341,15 @@ skip_luma:
|
|||
cpy #KEY_MINUS
|
||||
beq minus
|
||||
|
||||
; temp = $0010 << (8 - zoom)
|
||||
lda #$10
|
||||
sta temp
|
||||
; temp+temp2 = $00010000 << (8 - zoom)
|
||||
lda #$00
|
||||
sta temp
|
||||
sta temp + 1
|
||||
scale_zoom temp
|
||||
lda #$01
|
||||
sta temp + 2
|
||||
lda #$00
|
||||
sta temp + 3
|
||||
scale_zoom temp + 2
|
||||
|
||||
cpy #KEY_UP
|
||||
beq up
|
||||
|
@ -1296,14 +1359,7 @@ skip_luma:
|
|||
beq left
|
||||
cpy #KEY_RIGHT
|
||||
beq right
|
||||
cpy #KEY_1
|
||||
beq one
|
||||
cpy #KEY_2
|
||||
beq two
|
||||
cpy #KEY_3
|
||||
beq three
|
||||
cpy #KEY_4
|
||||
beq four
|
||||
jmp number_keys
|
||||
|
||||
skip_char:
|
||||
lda #0
|
||||
|
@ -1322,17 +1378,29 @@ minus:
|
|||
dec zoom
|
||||
jmp done
|
||||
up:
|
||||
sub16 oy, oy, temp
|
||||
sub32 oy, oy, temp
|
||||
jmp done
|
||||
down:
|
||||
add16 oy, oy, temp
|
||||
add32 oy, oy, temp
|
||||
jmp done
|
||||
left:
|
||||
sub16 ox, ox, temp
|
||||
sub32 ox, ox, temp
|
||||
jmp done
|
||||
right:
|
||||
add16 ox, ox, temp
|
||||
add32 ox, ox, temp
|
||||
jmp done
|
||||
|
||||
number_keys:
|
||||
cpy #KEY_1
|
||||
beq one
|
||||
cpy #KEY_2
|
||||
beq two
|
||||
cpy #KEY_3
|
||||
beq three
|
||||
cpy #KEY_4
|
||||
beq four
|
||||
jmp skip_char
|
||||
|
||||
one:
|
||||
ldx #0
|
||||
jmp load_key_viewport
|
||||
|
@ -1394,17 +1462,32 @@ zero_byte_loop:
|
|||
|
||||
txa
|
||||
asl a
|
||||
asl a
|
||||
|
||||
tax
|
||||
lda viewport_ox,x
|
||||
sta ox
|
||||
lda viewport_oy,x
|
||||
sta oy
|
||||
|
||||
inx
|
||||
lda viewport_ox,x
|
||||
sta ox + 1
|
||||
lda viewport_oy,x
|
||||
sta oy + 1
|
||||
|
||||
inx
|
||||
lda viewport_ox,x
|
||||
sta ox + 2
|
||||
lda viewport_oy,x
|
||||
sta oy + 2
|
||||
|
||||
inx
|
||||
lda viewport_ox,x
|
||||
sta ox + 3
|
||||
lda viewport_oy,x
|
||||
sta oy + 3
|
||||
|
||||
rts
|
||||
.endproc
|
||||
|
||||
|
@ -1526,10 +1609,10 @@ skipped_mask:
|
|||
not_skipped_mask:
|
||||
|
||||
; run the fractal!
|
||||
zoom_factor cx, sx, zoom, aspect_x
|
||||
add16 cx, cx, ox
|
||||
zoom_factor cy, sy, zoom, aspect_y
|
||||
add16 cy, cy, oy
|
||||
zoom_factor cx, sx, aspect_x
|
||||
add32 cx, cx, ox
|
||||
zoom_factor cy, sy, aspect_y
|
||||
add32 cy, cy, oy
|
||||
jsr mandelbrot
|
||||
jsr pset
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
|
|||
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
|
||||
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
|
||||
|
||||
The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
|
||||
The Mandelbrot calculations are done using 4.12-precision fixed-point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
|
||||
|
||||
Iterations are capped at 255.
|
||||
|
||||
|
@ -47,4 +47,4 @@ Currently produces a `.xex` executable, which can be booted up in common Atari e
|
|||
|
||||
## Todo
|
||||
|
||||
See ideas in `todo.md`.
|
||||
See ideas in `todo.md`.
|
||||
|
|
35
tables.js
35
tables.js
|
@ -11,19 +11,40 @@ function db(func) {
|
|||
return lines.join('\n');
|
||||
}
|
||||
|
||||
let squares = [];
|
||||
for (let i = 0; i < 512; i++) {
|
||||
squares.push(Math.trunc((i * i + 1) / 2));
|
||||
}
|
||||
|
||||
console.log(
|
||||
`.segment "TABLES"
|
||||
|
||||
.export mul_lobyte
|
||||
.export mul_hibyte
|
||||
.export mul_lobyte256
|
||||
.export mul_hibyte256
|
||||
.export mul_hibyte512
|
||||
.export sqr_lobyte
|
||||
.export sqr_hibyte
|
||||
|
||||
; (i * i) / 2 for the multiplier
|
||||
; (i * i + 1) / 2 for the multiplier
|
||||
.align 256
|
||||
mul_lobyte:
|
||||
${db((i) => ((i * i) >> 1) & 0xff)}
|
||||
mul_lobyte256:
|
||||
${db((i) => squares[i] & 0xff)}
|
||||
|
||||
.align 256
|
||||
mul_hibyte:
|
||||
${db((i) => ((i * i) >> 9) & 0xff)}
|
||||
mul_hibyte256:
|
||||
${db((i) => (squares[i] >> 8) & 0xff)}
|
||||
|
||||
.align 256
|
||||
mul_hibyte512:
|
||||
${db((i) => (squares[i + 256] >> 8) & 0xff)}
|
||||
|
||||
; (i * i) for the plain squares
|
||||
.align 256
|
||||
sqr_lobyte:
|
||||
${db((i) => (i * i) & 0xff)}
|
||||
|
||||
.align 256
|
||||
sqr_hibyte:
|
||||
${db((i) => ((i * i) >> 8) & 0xff)}
|
||||
|
||||
`);
|
||||
|
|
4
todo.md
4
todo.md
|
@ -1,9 +1,13 @@
|
|||
things to try:
|
||||
|
||||
* skip add on the top-byte multiply in sqr8/mul8
|
||||
* should save a few cycles, suggestion by jamey
|
||||
|
||||
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
|
||||
|
||||
* try 3.13 fixed point instead of 4.12 for more precision
|
||||
* can we get away without the extra bit?
|
||||
* since the exit-compare space would be 6.26, I think so
|
||||
|
||||
* y-axis mirror optimization
|
||||
|
||||
|
|
Loading…
Reference in a new issue