Compare commits

..

No commits in common. "main" and "main" have entirely different histories.
main ... main

3 changed files with 140 additions and 212 deletions

334
mandel.s
View file

@ -1,16 +1,16 @@
; Our zero-page vars ; Our zero-page vars
ox = $80 ; fixed6.26: center point x ox = $80 ; fixed8.24: center point x
oy = $84 ; fixed6.26: center point y oy = $84 ; fixed8.24: center point y
cx = $88 ; fixed6.26: c_x cx = $88 ; fixed8.24: c_x
cy = $8c ; fixed6.26: c_y cy = $8c ; fixed8.24: c_y
zx = $90 ; fixed6.26: z_x zx = $90 ; fixed8.24: z_x
zy = $94 ; fixed6.26: z_y zy = $94 ; fixed8.24: z_y
zx_2 = $98 ; fixed6.26: z_x^2 zx_2 = $98 ; fixed8.24: z_x^2
zy_2 = $9c ; fixed6.26: z_y^2 zy_2 = $9c ; fixed8.24: z_y^2
zx_zy = $a0 ; fixed6.26: z_x * z_y zx_zy = $a0 ; fixed8.24: z_x * z_y
dist = $a4 ; fixed6.26: z_x^2 + z_y^2 dist = $a4 ; fixed8.24: z_x^2 + z_y^2
sx = $a8 ; i16: screen pixel x sx = $a8 ; i16: screen pixel x
sy = $aa ; i16: screen pixel y sy = $aa ; i16: screen pixel y
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
@ -31,10 +31,10 @@ chroma_offset = $bb ; u8
palette_ticks = $bc ; u8 palette_ticks = $bc ; u8
chroma_ticks = $bd ; u8 chroma_ticks = $bd ; u8
count_frames = $be ; u8 count_frames = $be ; u8
; free space $bf count_pixels = $bf ; u8
count_iters = $c0 ; u16 total_pixels = $c0 ; float48
; free space c2-cb total_ms = $c6 ; float48
temp = $cc ; u16 temp = $cc ; u16
temp2 = $ce ; u16 temp2 = $ce ; u16
@ -59,12 +59,10 @@ LBUFF = $0580 ; result buffer for FASC routine
; FP ROM routine vectors ; FP ROM routine vectors
FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
FPI = $D9D2 ; floating point to integer
FADD = $DA66 ; ADDITION (FR0 += FR1) FADD = $DA66 ; ADDITION (FR0 += FR1)
FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1)
FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1)
FDIV = $DB28 ; DIVISION (FR0 /= FR1) FDIV = $DB28 ; DIVISION (FR0 /= FR1)
ZFR0 = $DA44 ; clear FR0
ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
@ -143,7 +141,7 @@ str_self:
.byte "MANDEL-6502" .byte "MANDEL-6502"
str_self_end: str_self_end:
str_speed: str_speed:
.byte "us/iter: " .byte " ms/px"
str_speed_end: str_speed_end:
str_run: str_run:
.byte " RUN" .byte " RUN"
@ -191,38 +189,20 @@ aspect:
; ;
; 184h is the equiv of 220.8h at square pixels ; 184h is the equiv of 220.8h at square pixels
; 320 / 220.8 = 1.45 display aspect ratio ; 320 / 220.8 = 1.45 display aspect ratio
aspect_x: ; fixed3.13 5/4 aspect_x: ; fixed4.12 5/4
.word 5 << (13 - 2) .word 5 << (12 - 2)
aspect_y: ; fixed3.13 3/4 aspect_y: ; fixed4.12 3/4
.word 3 << (13 - 2) .word 3 << (12 - 2)
sec_per_frame: ; float48 00 . 01 66 66 66 67 ms_per_frame: ; float48 16.66666667
.byte 63 ; exponent/sign - -1 bytes .byte 64 ; exponent/sign
.byte $01 ; BCD digits .byte $16 ; BCD digits
.byte $66 .byte $66
.byte $66 .byte $66
.byte $66 .byte $66
.byte $67 .byte $67
us_per_sec: ; float48 1e9 01 00 0,0 00 . 00
.byte 67 ; exponent/sign +3 bytes
.byte $01 ; BCD digits
.byte $00
.byte $00
.byte $00
.byte $00
total_iters: ; float48
.repeat 6
.byte 0
.endrepeat
total_sec: ; float48
.repeat 6
.byte 0
.endrepeat
display_list_start: display_list_start:
; 24 lines overscan ; 24 lines overscan
.repeat 3 .repeat 3
@ -254,9 +234,9 @@ display_list_len = display_list_end - display_list_start
color_map: color_map:
.byte 0 .byte 0
.repeat 85 .repeat 85
.byte %01010101 .byte 1
.byte %10101010 .byte 2
.byte %11111111 .byte 3
.endrepeat .endrepeat
@ -305,34 +285,23 @@ fill_masks:
.byte %00000001 .byte %00000001
.byte %00000000 .byte %00000000
pixel_masks:
.byte %11111111
.byte %11110000
.byte %11000000
viewport_zoom: viewport_zoom:
.byte 0 .byte 1
.byte 5 .byte 6
.byte 7 .byte 8
.byte 5 .byte 6
.byte 7
.byte 7
viewport_ox: viewport_ox:
.dword ($00000000 & $3fffffff) << 2 .dword $00000000
.dword ($ff110000 & $3fffffff) << 2 .dword $ff110000
.dword ($ff110000 & $3fffffff) << 2 .dword $ff110000
.dword ($fe400000 & $3fffffff) << 2 .dword $fe400000
.dword ($fe3b0000 & $3fffffff) << 2
.dword $fd220000
viewport_oy: viewport_oy:
.dword ($00000000 & $3fffffff) << 2 .dword $00000000
.dword ($ffb60000 & $3fffffff) << 2 .dword $ffb60000
.dword ($ffbe0000 & $3fffffff) << 2 .dword $ffbe0000
.dword ($00000000 & $3fffffff) << 2 .dword $00000000
.dword ($fffe0000 & $3fffffff) << 2
.dword $ff000000
; 2 + 9 * byte cycles ; 2 + 9 * byte cycles
.macro add bytes, dest, arg1, arg2 .macro add bytes, dest, arg1, arg2
@ -484,6 +453,20 @@ viewport_oy:
sta dest + 1 sta dest + 1
.endmacro .endmacro
; sqr8_add16 dest, arg
;   arg:  u8, used as the index into the sqr_lobyte / sqr_hibyte tables
;   dest: u16 (low byte at dest, high byte at dest + 1), updated in place:
;         dest += (sqr_hibyte[arg] << 8) | sqr_lobyte[arg]
;         (presumably arg * arg -- the tables are defined elsewhere, confirm there)
;   clobbers: A, X, flags; a carry out of the high byte is not propagated further
.macro sqr8_add16 dest, arg
    clc                 ; start the 16-bit add with no carry in
    ldx arg             ; X = table index (ldx does not touch the carry flag)
    lda dest
    adc sqr_lobyte,x    ; low byte; carry out feeds the high-byte add below
    sta dest
    lda dest + 1
    adc sqr_hibyte,x    ; high byte plus carry from the low byte
    sta dest + 1
.endmacro
.segment "TABLES" .segment "TABLES"
; lookup table for top byte -> PORTB value for bank-switch ; lookup table for top byte -> PORTB value for bank-switch
.align 256 .align 256
@ -766,8 +749,9 @@ inner_loop:
; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2 ; h1*h2*256*256 + h1*l2*256 + h2*l1*256 + l1*l2
imul8 result, arg1, arg2, xe imul8 result, arg1, arg2, xe
lda #0
imul8 result + 2, arg1 + 1, arg2 + 1, xe sta result + 2
sta result + 3
imul8 inter, arg1 + 1, arg2, xe imul8 inter, arg1 + 1, arg2, xe
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
@ -777,6 +761,9 @@ inner_loop:
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
imul8 inter, arg1 + 1, arg2 + 1, xe
add16 result + 2, result + 2, inter
; In case of negative inputs, adjust high word ; In case of negative inputs, adjust high word
; https://stackoverflow.com/a/28827013 ; https://stackoverflow.com/a/28827013
lda arg1 + 1 lda arg1 + 1
@ -809,8 +796,9 @@ arg2_pos:
; h*h*256*256 + h*l*256 + h*l*256 + l*l ; h*h*256*256 + h*l*256 + h*l*256 + l*l
sqr8 result, arg sqr8 result, arg
lda #0
sqr8 result + 2, arg + 1 sta result + 2
sta result + 3
imul8 inter, arg + 1, arg, xe imul8 inter, arg + 1, arg, xe
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
@ -818,6 +806,8 @@ arg2_pos:
add16 result + 1, result + 1, inter add16 result + 1, result + 1, inter
add_carry result + 3 add_carry result + 3
sqr8_add16 result + 2, arg + 1
rts ; 6 cyc rts ; 6 cyc
.endscope .endscope
.endmacro .endmacro
@ -885,8 +875,8 @@ next:
.proc mandelbrot .proc mandelbrot
; input: ; input:
; cx: position scaled to 6.26 fixed point - -32..+31.9 ; cx: position scaled to 8.24 fixed point - -128..+127.9
; cy: position scaled to 6.26 ; cy: position scaled to 8.24
; ;
; output: ; output:
; iter: iteration count at escape or 0 ; iter: iteration count at escape or 0
@ -937,11 +927,6 @@ next:
sta z_buffer_end sta z_buffer_end
loop: loop:
inc count_iters
bne low_iters
inc count_iters + 1
low_iters:
; iter++ & max-iters break ; iter++ & max-iters break
inc iter inc iter
bne keep_going bne keep_going
@ -949,7 +934,7 @@ low_iters:
keep_going: keep_going:
.macro quick_exit arg, max .macro quick_exit arg, max
; arg: fixed6.26 ; arg: fixed8.24
; max: integer ; max: integer
.local positive .local positive
.local negative .local negative
@ -962,12 +947,12 @@ keep_going:
bmi negative bmi negative
positive: positive:
cmp #(max << 2) cmp #max
bmi all_done ; 'less than' bmi all_done ; 'less than'
jmp exit_path jmp exit_path
negative: negative:
cmp #(256 - (max << 2)) cmp #(256 - max)
beq first_equal ; 'equal' on first byte beq first_equal ; 'equal' on first byte
bpl all_done ; 'greater than' bpl all_done ; 'greater than'
@ -987,7 +972,7 @@ keep_going:
all_done: all_done:
.endmacro .endmacro
; 6.26: (-32 .. 31.9) ; 8.24: (-128 .. 127.9)
; zx = zx_2 - zy_2 + cx ; zx = zx_2 - zy_2 + cx
sub32 zx, zx_2, zy_2 sub32 zx, zx_2, zy_2
add32 zx, zx, cx add32 zx, zx, cx
@ -998,9 +983,9 @@ keep_going:
add32 zy, zy, cy add32 zy, zy, cy
quick_exit zy, 2 quick_exit zy, 2
; convert 6.26 -> 3.13: (-4 .. +3.9) ; convert 8.24 -> 4.12: (-8 .. +7.9)
shift_round_16 zx, 3 shift_round_16 zx, 4
shift_round_16 zy, 3 shift_round_16 zy, 4
; zx_2 = zx * zx ; zx_2 = zx * zx
sqr16 zx_2, zx + 2 sqr16 zx_2, zx + 2
@ -1122,9 +1107,9 @@ enough:
.endmacro .endmacro
.macro zoom_factor dest, src, aspect .macro zoom_factor dest, src, aspect
; output: dest: fixed6.26 ; output: dest: fixed8.24
; input: src: fixed3.13 ; input: src: fixed4.12
; aspect: fixed3.13 ; aspect: fixed4.12
; clobbers A, X, flags, etc ; clobbers A, X, flags, etc
copy16 dest, src copy16 dest, src
scale_zoom dest scale_zoom dest
@ -1142,11 +1127,8 @@ enough:
; iter -> color ; iter -> color
ldx iter ldx iter
lda color_map,x lda color_map,x
ldx fill_level
and pixel_masks,x
sta pixel_color sta pixel_color
lda pixel_masks,x lda #(255 - 3)
eor #$ff
sta pixel_mask sta pixel_mask
; sy -> line base address in temp ; sy -> line base address in temp
@ -1195,23 +1177,22 @@ point:
; pixel_mask <<= pixel_shift (shifting in ones) ; pixel_mask <<= pixel_shift (shifting in ones)
and #3 and #3
sta pixel_shift sta pixel_shift
lda #3
sec
sbc pixel_shift
tax tax
shift_loop: shift_loop:
beq shift_done beq shift_done
lsr pixel_color asl pixel_color
lsr pixel_color asl pixel_color
sec sec
ror pixel_mask rol pixel_mask
sec sec
ror pixel_mask rol pixel_mask
dex dex
jmp shift_loop jmp shift_loop
shift_done: shift_done:
ldy fill_level
ldx fill_masks,y
inx
; pixel_offset = temp >> 2 ; pixel_offset = temp >> 2
lda temp lda temp
lsr a lsr a
@ -1219,25 +1200,12 @@ shift_done:
sta pixel_offset sta pixel_offset
tay tay
draw_pixel:
; read, mask, or, write ; read, mask, or, write
lda (pixel_ptr),y lda (pixel_ptr),y
and pixel_mask and pixel_mask
ora pixel_color ora pixel_color
sta (pixel_ptr),y sta (pixel_ptr),y
dex
beq done
clc
lda #40
adc pixel_ptr
sta pixel_ptr
lda #0
adc pixel_ptr + 1
sta pixel_ptr + 1
jmp draw_pixel
done:
rts rts
.endproc .endproc
@ -1245,7 +1213,6 @@ done:
; clobbers A, X ; clobbers A, X
.local loop .local loop
.local done .local done
.local padding
ldx #0 ldx #0
loop: loop:
cpx #len cpx #len
@ -1253,27 +1220,11 @@ loop:
txa txa
tay tay
lda (strptr),y lda (strptr),y
pha ; save the char for terminator check
and #$7f ; strip the high bit (terminator)
tay tay
lda char_map,y lda char_map,y
sta textbuffer + col,x sta textbuffer + col,x
inx inx
pla
bmi padding
jmp loop jmp loop
padding:
ldy #32 ; space
lda char_map,y
cpx #len
beq done
sta textbuffer + col,x
inx
jmp padding
done: done:
.endmacro .endmacro
@ -1416,7 +1367,7 @@ skip_char:
plus: plus:
lda zoom lda zoom
cmp #7 cmp #8
bpl skip_char bpl skip_char
inc zoom inc zoom
jmp done jmp done
@ -1448,10 +1399,6 @@ number_keys:
beq three beq three
cpy #KEY_4 cpy #KEY_4
beq four beq four
cpy #KEY_5
beq five
cpy #KEY_6
beq six
jmp skip_char jmp skip_char
one: one:
@ -1465,12 +1412,6 @@ three:
jmp load_key_viewport jmp load_key_viewport
four: four:
ldx #3 ldx #3
jmp load_key_viewport
five:
ldx #4
jmp load_key_viewport
six:
ldx #5
; fall through ; fall through
load_key_viewport: load_key_viewport:
jsr load_viewport jsr load_viewport
@ -1601,20 +1542,16 @@ copy_byte_loop:
jsr SETVBV jsr SETVBV
main_loop: main_loop:
; count_frames = 0; count_iters = 0 ; count_frames = 0; count_pixels = 0
lda #0 lda #0
sta count_frames sta count_frames
sta count_iters sta count_pixels
sta count_iters + 1
; total_sec = 0.0; total_iters = 0.0 ; total_ms = 0.0; total_pixels = 0.0
jsr ZFR0 ldx #total_ms
ldx #.lobyte(total_sec) jsr ZF1
ldy #.hibyte(total_sec) ldx #total_pixels
jsr FST0R jsr ZF1
ldx #.lobyte(total_iters)
ldy #.hibyte(total_iters)
jsr FST0R
jsr clear_screen jsr clear_screen
jsr status_bar jsr status_bar
@ -1686,32 +1623,38 @@ not_skipped_mask:
no_key: no_key:
; check if we should update the counters ; check if we should update the counters
;
; count_pixels >= width? update!
inc count_pixels
lda count_pixels
cmp #width
bmi update_status
; count_frames >= 120? update! ; count_frames >= 120? update!
lda count_frames lda count_frames
cmp #120 ; >= 2 seconds cmp #120 ; >= 2 seconds
bpl update_status bmi skipped
jmp skipped
update_status: update_status:
; FR0 = (float)count_iters & clear count_iters ; FR0 = (float)count_pixels & clear count_pixels
copy16 FR0, count_iters lda count_pixels
jsr IFP sta FR0
lda #0 lda #0
sta count_iters sta FR0 + 1
sta count_iters + 1 sta count_pixels
jsr IFP
; FR1 = total_iters ; FR1 = total_pixels
ldx #.lobyte(total_iters) ldx #.lobyte(total_pixels)
ldy #.hibyte(total_iters) ldy #.hibyte(total_pixels)
jsr FLD1R jsr FLD1R
; FR0 += FR1 ; FR0 += FR1
jsr FADD jsr FADD
; total_iters = FR0 ; total_pixels = FR0
ldx #.lobyte(total_iters) ldx #.lobyte(total_pixels)
ldy #.hibyte(total_iters) ldy #.hibyte(total_pixels)
jsr FST0R jsr FST0R
@ -1724,58 +1667,44 @@ update_status:
sta count_frames sta count_frames
jsr IFP jsr IFP
; FR0 *= sec_per_frame ; FR0 *= ms_per_frame
ldx #.lobyte(sec_per_frame) ldx #.lobyte(ms_per_frame)
ldy #.hibyte(sec_per_frame) ldy #.hibyte(ms_per_frame)
jsr FLD1R jsr FLD1R
jsr FMUL jsr FMUL
; FR0 += total_sec ; FR0 += total_ms
ldx #.lobyte(total_sec) ldx #total_ms
ldy #.hibyte(total_sec) ldy #0
jsr FLD1R jsr FLD1R
jsr FADD jsr FADD
; total_sec = FR0 ; total_ms = FR0
ldx #.lobyte(total_sec) ldx #total_ms
ldy #.hibyte(total_sec) ldy #0
jsr FST0R jsr FST0R
; FR0 /= total_iters ; FR0 /= total_pixels
ldx #.lobyte(total_iters) ldx #total_pixels
ldy #.hibyte(total_iters) ldy #0
jsr FLD1R jsr FLD1R
jsr FDIV jsr FDIV
; FR0 *= us_per_sec
ldx #.lobyte(us_per_sec)
ldy #.hibyte(us_per_sec)
jsr FLD1R
jsr FMUL
; round (down) to integer
jsr FPI
clc
jsr IFP
; convert to ASCII in INBUFF ; convert to ASCII in INBUFF
jsr FASC jsr FASC
; print the first 6 digits ; print the first 6 digits
draw_text speed_start, str_speed_len, str_speed draw_text_indirect speed_start, speed_precision, INBUFF
draw_text_indirect speed_start + str_speed_len, speed_precision, INBUFF draw_text speed_start + speed_precision, str_speed_len, str_speed
skipped: skipped:
; sx += fill_level[fill_masks] + 1
ldx fill_level
lda fill_masks,x
clc clc
adc #1 ; will never carry lda sx
adc sx adc #1
sta sx sta sx
lda #0 lda sx + 1
adc sx + 1 adc #0
sta sx + 1 sta sx + 1
lda sx lda sx
@ -1785,15 +1714,12 @@ skipped:
loop_sx_done: loop_sx_done:
; sy += fill_level[fill_masks] + 1
ldx fill_level
lda fill_masks,x
clc clc
adc #1 ; will never carry lda sy
adc sy adc #1
sta sy sta sy
lda #0 lda sy + 1
adc sy + 1 adc #0
sta sy + 1 sta sy + 1
lda sy lda sy

View file

@ -18,7 +18,7 @@ Enjoy! I'll probably work on this off and on for the next few weeks until I've g
## Current state ## Current state
Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys. Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 4 preset viewports via the number keys.
The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered. The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
@ -27,7 +27,7 @@ The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 3
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications * when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication * without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates. The mandelbrot calculations are done using 4.12-precision fixed point numbers with 8.24-precision intermediates. It may be possible to squish this down to 3.13/6.26.
Iterations are capped at 255. Iterations are capped at 255.

14
todo.md
View file

@ -1,17 +1,19 @@
things to try: things to try:
* fix status bar to show elapsed time, per-iter time, per-pixel iter count * skip add on the top-byte multiply in sqr8/mul8
* should save a few cycles, suggestion by jamey
* 'turbo' mode disabling graphics in full or part
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D * patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* maybe clean up the load/layout of the big mul table * try 3.13 fixed point instead of 4.12 for more precision
* can we get away without the extra bit?
* consider alternate lookup tables in the top 16KB under ROM * since exit compare space would be 6.26 i think so
* y-axis mirror optimization * y-axis mirror optimization
* 'wide pixels' 2x and 4x for a fuller initial image in the tiered rendering
* maybe redo tiering to just 4x4, 2x2, 1x1?
* extract viewport for display & re-input via keyboard * extract viewport for display & re-input via keyboard
* fujinet screenshot/viewport uploader * fujinet screenshot/viewport uploader