Compare commits

..

No commits in common. "main" and "status" have entirely different histories.

13 changed files with 924 additions and 2435 deletions

1
.gitignore vendored
View file

@ -1,4 +1,3 @@
*.o
*.xex
tables.s
.DS_Store

View file

@ -1,2 +0,0 @@
Brooke Vibber <bvibber@pobox.com>
Brooke Vibber <bvibber@pobox.com> <brion@pobox.com>

View file

@ -2,21 +2,13 @@
all : mandel.xex
mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg
ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib
mandel.s : mandel.c mandel.h
cc65 -o $@ mandel.c
%.xex : %.o
ld65 -C atari-asm-xex.cfg -o $@ $<
%.o : %.s
ca65 -o $@ $<
tables.s : tables.js
node tables.js > tables.s
clean :
rm -f tables.s
rm -f mandel.s
rm -f *.o
rm -f *.xex
rm -f mandel.map

View file

@ -1,28 +0,0 @@
# ld65 linker configuration producing an Atari executable with two load
# areas: MAIN (from the start address up to $4000) and TABLES ($8000-$9fff).
# The program's run address is the exported symbol `start`.
FEATURES {
    # Default load address; %S below expands to this value.
    STARTADDRESS: default = $2E00;
}
SYMBOLS {
    __STARTADDRESS__: type = export, value = %S;
}
MEMORY {
    # Zero page is not written to the output file (file = "").
    ZP:     file = "", define = yes, start = $0082, size = $007E;
    MAIN:   file = %O, define = yes, start = %S,    size = $4000 - %S;
    # Keep $4000-7fff clear for expanded RAM access window
    TABLES: file = %O, define = yes, start = $8000, size = $a000 - $8000;
    # Keep $a000-$bfff clear for BASIC cartridge
}
FILES {
    %O: format = atari;
}
FORMATS {
    atari: runad = start;
}
SEGMENTS {
    ZEROPAGE: load = ZP,     type = zp,  optional = yes;
    EXTZP:    load = ZP,     type = zp,  optional = yes; # to enable modules to be able to link to C and assembler programs
    CODE:     load = MAIN,   type = rw,  define = yes;
    # Attribute separators written out explicitly so every line uses the
    # same `attr = value, attr = value` form.
    RODATA:   load = MAIN,   type = ro,  optional = yes;
    DATA:     load = MAIN,   type = rw,  optional = yes;
    BSS:      load = MAIN,   type = bss, optional = yes, define = yes;
    # Tables are page-aligned.
    TABLES:   load = TABLES, type = ro,  optional = yes, align = 256;
}

View file

@ -1,69 +0,0 @@
# Sample linker configuration for C programs using the Atari binary file support.
# Use with: cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex
FEATURES {
STARTADDRESS: default = $8000;
}
SYMBOLS {
__SYSTEM_CHECK__: type = import; # force inclusion of "system check" load chunk
__STACKSIZE__: type = weak, value = $0800; # 2k stack
__STARTADDRESS__: type = export, value = %S;
__RESERVED_MEMORY__: type = weak, value = $0000;
__SYSCHKHDR__: type = export, value = 0; # Disable system check header
__SYSCHKTRL__: type = export, value = 0; # Disable system check trailer
__TABLESEG_START__: type = weak, value = $2E00 + $0300;
__TABLESEG_SIZE__: type = weak, value = 6 * $100;
__BANKSY_START__: type = weak, value = $4000;
__BANKSY_SIZE__: type = weak, value = $4000;
__FRAMEBUFFER_START__: type = weak, value = $A000;
}
MEMORY {
# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP.
ZP: file = "", define = yes, start = $0082, size = $007E;
# "system check" load chunk
SYSCHKCHNK: file = %O, start = $2E00, size = $0300;
# Note: $a000-$bfff (where __FRAMEBUFFER_START__ points) overlaps the BASIC cartridge area, so booting may require holding OPTION.
TABLES: file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__;
# We reserve $4000-7fff for the bank-switch window.
# In theory we could keep data and code here that we only use on 48k/64k systems.
BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__;
# "main program" load chunk
MAIN: file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S;
}
FILES {
%O: format = atari;
}
FORMATS {
atari: runad = start,
initad = SYSCHKCHNK: __SYSTEM_CHECK__;
}
# Segment-to-memory-area assignments. Everything except the zero page,
# the system-check chunk, TABLES and the bank-switch window is loaded
# into MAIN.
SEGMENTS {
ZEROPAGE: load = ZP, type = zp;
EXTZP: load = ZP, type = zp, optional = yes;
SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes;
# Page-aligned lookup tables.
TABLES: load = TABLES, type = ro, optional = yes, align = 256;
# NOTE(review): the segment is spelled BANKSWICH while the memory area is
# BANKSWITCH. Sources must use the same spelling in `.segment`; confirm
# against the assembly before renaming either one.
BANKSWICH: load = BANKSWITCH, type = ro, optional = yes;
STARTUP: load = MAIN, type = ro, define = yes;
LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized
LOWCODE: load = MAIN, type = ro, define = yes, optional = yes;
ONCE: load = MAIN, type = ro, optional = yes;
CODE: load = MAIN, type = ro, define = yes;
RODATA: load = MAIN, type = ro;
DATA: load = MAIN, type = rw;
INIT: load = MAIN, type = rw, optional = yes;
BSS: load = MAIN, type = bss, define = yes;
}
FEATURES {
CONDES: type = constructor,
label = __CONSTRUCTOR_TABLE__,
count = __CONSTRUCTOR_COUNT__,
segment = ONCE;
CONDES: type = destructor,
label = __DESTRUCTOR_TABLE__,
count = __DESTRUCTOR_COUNT__,
segment = RODATA;
CONDES: type = interruptor,
label = __INTERRUPTOR_TABLE__,
count = __INTERRUPTOR_COUNT__,
segment = RODATA,
import = __CALLIRQ__;
}

File diff suppressed because it is too large Load diff

View file

@ -1,15 +0,0 @@
/**
* The UI and I/O wrapper for the Mandelbrot runner, in C.
*
* For the moment *all* logic is in mandel-core.s, I'm just
* trying to get this to run within a cc65 environment.
* Eventually just the inner loop fun will live in there.
*/
#include <stdlib.h>
#include <stdio.h>
#include "mandel.h"
/*
 * Program entry point. Does nothing but hand control to mandel_start(),
 * which is declared in mandel.h as coming from mandel-core.s.
 */
void main(void) {
mandel_start();
}

View file

@ -1,4 +0,0 @@
#include <inttypes.h>
// From mandel-core.s:
extern void mandel_start(void);

912
mandel.s Normal file
View file

@ -0,0 +1,912 @@
; Our zero-page vars
sx = $80 ; i16: screen pixel x
sy = $82 ; i16: screen pixel y
ox = $84 ; fixed4.12: center point x
oy = $86 ; fixed4.12: center point y
cx = $88 ; fixed4.12: c_x
cy = $8a ; fixed4.12: c_y
zx = $8c ; fixed4.12: z_x
zy = $8e ; fixed4.12: z_y
zx_2 = $90 ; fixed4.12: z_x^2
zy_2 = $92 ; fixed4.12: z_y^2
zx_zy = $94 ; fixed4.12: z_x * z_y
dist = $96 ; fixed4.12: z_x^2 + z_y^2
iter = $a0 ; u8: iteration count
zoom = $a1 ; u8: zoom shift level
count_frames = $a2 ; u8
count_pixels = $a3 ; u8
total_ms = $a4 ; float48
total_pixels = $aa ; float48
temp = $b0 ; u16
pixel_ptr = $b2 ; u16
pixel_color = $b4 ; u8
pixel_mask = $b5 ; u8
pixel_shift = $b6 ; u8
pixel_offset = $b7 ; u8
; FP registers in zero page
FR0 = $d4 ; float48
FRE = $da
FR1 = $e0 ; float48
FR2 = $e6 ; float48
CIX = $f2 ; u8 - index into INBUFF
INBUFF = $f3 ; u16 - pointer to ascii
FLPTR = $fc ; u16 - pointer to user buffer float48
LBUFF = $0580 ; result buffer for FASC routine
; FP ROM routine vectors
FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set)
IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48)
FADD = $DA66 ; ADDITION (FR0 += FR1)
FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1)
FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1)
FDIV = $DB28 ; DIVISION (FR0 /= FR1)
ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX)
FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX)
FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX)
FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX)
FMOVE = $DDB6 ; MOVE FR0 TO FR1
; High data
framebuffer_top = $8000
textbuffer = $8f00
framebuffer_bottom = $9000
display_list = $9f00
framebuffer_end = $a000
height = 184
half_height = height >> 1
width = 160
half_width = width >> 1
stride = width >> 2
DMACTL = $D400
DLISTL = $D402
DLISTH = $D403
; OS shadow registers
SDLSTL = $230
SDLSTH = $231
; interrupt stuff
XITVBV = $E462
SETVBV = $E45C
; Layout of a 6-byte (48-bit) floating point value as used by the OS FP
; routines referenced above: one exponent/sign byte followed by five bytes
; of BCD mantissa (compare ms_per_frame, which is 1 + 5 bytes, and
; copyfloat, which moves 6 bytes).
.struct float48
    exponent .byte
    mantissa .byte 5
.endstruct
.data
strings:
str_self:
.byte "MANDEL-6502"
str_self_end:
str_speed:
.byte "ms/px"
str_speed_end:
str_run:
.byte " RUN"
str_run_end:
str_done:
.byte "DONE"
str_done_end:
str_self_len = str_self_end - str_self
str_speed_len = str_speed_end - str_speed
str_run_len = str_run_end - str_run
str_done_len = str_done_end - str_done
speed_start = str_self_len + 2
speed_len = 14 + str_speed_len
char_map:
; Map ATASCII string values to framebuffer font entries
; Sighhhhh
.repeat 32, i
.byte i + 64
.endrepeat
.repeat 64, i
.byte i
.endrepeat
.repeat 32, i
.byte 96 + i
.endrepeat
aspect:
; Aspect-ratio correction factors applied to screen coordinates.
; pixels at 320w are 5:6 (narrow)
; pixels at 160w are 5:3 (wide)
;
; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4)
; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4)
;
; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624
; and horizontal range -80 .. 79.9 is -3.125 .. 3.124
;
; 184h is the equiv of 220.8h at square pixels
; 320 / 220.8 = 1.45 display aspect ratio
aspect_x: ; fixed4.12 5/4 (5 << 10 = 1.25 * 4096)
.word 5 << (12 - 2)
aspect_y: ; fixed4.12 3/4 (3 << 10 = 0.75 * 4096)
.word 3 << (12 - 2)
ms_per_frame: ; float48 16.66666667
.byte 64 ; exponent/sign
.byte $16 ; BCD digits
.byte $66
.byte $66
.byte $66
.byte $67
display_list_start:
; 24 lines overscan
.repeat 3
.byte $70 ; 8 blank lines
.endrep
; 8 scan lines, 1 row of 40-column text
.byte $42
.addr textbuffer
; 184 lines graphics
; ANTIC mode e (160px 2bpp, 1 scan line per line)
.byte $4e
.addr framebuffer_top
.repeat half_height - 1
.byte $0e
.endrep
.byte $4e
.addr framebuffer_bottom
.repeat half_height - 1
.byte $0e
.endrep
.byte $41 ; jump and blank
.addr display_list
display_list_end:
display_list_len = display_list_end - display_list_start
color_map:
.byte 0
.repeat 85
.byte 1
.byte 2
.byte 3
.endrepeat
.code
.export start
; add: multi-byte little-endian addition, dest = arg1 + arg2.
;   bytes      - operand width in bytes
;   dest       - address of the result
;   arg1, arg2 - addresses of the operands
; Clobbers A and the flags; carry holds the final carry-out.
; Cost: 2 + 9 * bytes cycles (assuming zero-page operands).
.macro add bytes, dest, arg1, arg2
        clc                             ; no carry into the low byte (2 cyc)
    .repeat bytes, offs                 ; low byte first so the carry ripples up
        lda     arg1 + offs
        adc     arg2 + offs
        sta     dest + offs
    .endrepeat
.endmacro

; add16: 16-bit form of add.
.macro add16 dest, arg1, arg2
        add     2, dest, arg1, arg2
.endmacro
; add32: 32-bit form of add, dest = arg1 + arg2.
; Both operands are forwarded to add, matching add16 and sub32. The
; previous body passed (arg2, dest), so arg1 was silently ignored.
.macro add32 dest, arg1, arg2
        add     4, dest, arg1, arg2
.endmacro
; sub: multi-byte little-endian subtraction, dest = arg1 - arg2.
;   bytes      - operand width in bytes
;   dest       - address of the result
;   arg1, arg2 - addresses of minuend and subtrahend
; Clobbers A and the flags.
; Cost: 2 + 9 * bytes cycles (assuming zero-page operands).
.macro sub bytes, dest, arg1, arg2
        sec                             ; carry set = no borrow (2 cyc)
    .repeat bytes, offs                 ; low byte first so the borrow ripples up
        lda     arg1 + offs
        sbc     arg2 + offs
        sta     dest + offs
    .endrepeat
.endmacro

; sub16: 16-bit form of sub.
.macro sub16 dest, arg1, arg2
        sub     2, dest, arg1, arg2
.endmacro

; sub32: 32-bit form of sub.
.macro sub32 dest, arg1, arg2
        sub     4, dest, arg1, arg2
.endmacro
; shl: shift a little-endian multi-byte value left by one bit, in place.
;   bytes - operand width in bytes
;   arg   - address of the value
; The bit shifted out of the top byte is left in carry.
.macro shl bytes, arg
        asl     arg                     ; low byte: zero shifted in
    .repeat bytes - 1, n
        rol     arg + 1 + n             ; higher bytes pick up the carry
    .endrepeat
.endmacro

; shl16 / shl24 / shl32: fixed-width forms of shl.
.macro shl16 arg
        shl     2, arg
.endmacro

.macro shl24 arg
        shl     3, arg
.endmacro

.macro shl32 arg
        shl     4, arg
.endmacro
; copy: copy `bytes` bytes from arg to dest through the accumulator.
; Clobbers A and the N/Z flags.
; Cost: 6 * bytes cycles (3 + 3 per byte, assuming zero-page operands).
.macro copy bytes, dest, arg
    .repeat bytes, offs
        lda     arg + offs              ; 3 cyc
        sta     dest + offs             ; 3 cyc
    .endrepeat
.endmacro

; copy16 / copy32: 2- and 4-byte forms of copy.
.macro copy16 dest, arg
        copy    2, dest, arg
.endmacro

.macro copy32 dest, arg
        copy    4, dest, arg
.endmacro

; copyfloat: copy one 6-byte float48 value.
.macro copyfloat dest, arg
        copy    6, dest, arg
.endmacro
; neg: two's-complement negate a little-endian multi-byte value in place,
; computed as 0 - arg with the borrow rippling upward.
;   bytes - operand width in bytes
;   arg   - address of the value
; Clobbers A and the flags.
; Cost: 2 + 8 * bytes cycles (assuming a zero-page operand).
.macro neg bytes, arg
        sec                             ; 2 cyc
    .repeat bytes, offs
        lda     #0                      ; 2 cyc
        sbc     arg + offs              ; 3 cyc
        sta     arg + offs              ; 3 cyc
    .endrepeat
.endmacro

; neg16: 16-bit form of neg (18 cycles).
.macro neg16 arg
        neg     2, arg
.endmacro

; neg32: 32-bit form of neg (34 cycles).
.macro neg32 arg
        neg     4, arg
.endmacro
; inner loop for imul16
; One step of a shift-and-add multiply: tests bit `bitnum` of arg1; if it
; is set, adds arg2 into the top 16 bits of `result`; then shifts `result`
; right by one bit.
;   arg1   - address of the 16-bit multiplier (only read)
;   arg2   - address of the 16-bit multiplicand (only read)
;   result - address of the 32-bit accumulator
;   bitnum - constant 0..15 selecting which bit of arg1 to test
; Clobbers A and the flags.
; Cycle counts below are the original author's annotations.
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
.macro bitmul16 arg1, arg2, result, bitnum
.local zero
.local one
.local next
; does 16-bit adds
; arg1 and arg2 are treated as unsigned
; negative signed inputs must be flipped first
; 7 cycles up to the branch
; check if arg1 has 0 or 1 bit in this place
; 5 cycles either way
.if bitnum < 8
lda arg1 ; 3 cyc
and #(1 << (bitnum)) ; 2 cyc
.else
lda arg1 + 1 ; 3 cyc
and #(1 << ((bitnum) - 8)) ; 2 cyc
.endif
bne one ; 2 cyc
zero: ; 18 cyc, 23 cyc
; bit clear: nothing to add, so shift a zero into the top byte
lsr result + 3 ; 5 cyc
jmp next ; 3 cyc
one: ; 32 cyc, 37 cyc
; 16-bit add on the top bits
clc ; 2 cyc
lda result + 2 ; 3 cyc
adc arg2 ; 3 cyc
sta result + 2 ; 3 cyc
lda result + 3 ; 3 cyc
adc arg2 + 1 ; 3 cyc
; the add's carry-out becomes the new top bit of the shifted byte
ror a ; 2 cyc - get a jump on the shift
sta result + 3 ; 3 cyc
next:
; both paths arrive with carry = the bit shifted out of result + 3
ror result + 2 ; 5 cyc
ror result + 1 ; 5 cyc
.if bitnum >= 8
; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte
; when it's all uninitialized data
ror result ; 5 cyc
.endif
.endmacro
; check_sign: make a signed 16-bit value non-negative in place and count
; how many values were flipped.
;   arg - address of the 16-bit value
; If the sign bit (bit 7 of arg + 1) is set, arg is negated and X is
; incremented; otherwise nothing changes. Clobbers A and the flags.
; Cost: 5 to 25 cycles.
.macro check_sign arg
    .local positive
        lda     arg + 1                 ; 3 cyc - high byte carries the sign
        bpl     positive                ; 2 cyc
        neg16   arg                     ; 18 cyc
        inx                             ; 2 cyc - one more negative input seen
positive:
.endmacro
; imul16: signed 16 x 16 -> 32-bit multiply, dest = arg1 * arg2.
; Stages the operands in FR0/FR1, calls imul16_func, then copies the
; 32-bit product from FR2 to dest. Clobbers FR0, FR1, FR2, A, X.
; Cost: 518 - 828 cycles.
.macro imul16 dest, arg1, arg2
        copy16  FR0, arg1               ; 12 cyc
        copy16  FR1, arg2               ; 12 cyc
        jsr     imul16_func             ; 470-780 cyc
        copy32  dest, FR2               ; 24 cyc
.endmacro

; shift_round_16: shift a 32-bit value left `shift` times, then round its
; top 16 bits in place with round16.
.macro shift_round_16 arg, shift
    .repeat shift
        shl32   arg
    .endrepeat
        round16 arg
.endmacro

; imul16_round: multiply as imul16, shift the 32-bit product left by
; `shift` bits, round, and store only the top 16 bits in dest.
; Clobbers FR0, FR1, FR2, A, X.
.macro imul16_round dest, arg1, arg2, shift
        copy16  FR0, arg1               ; 12 cyc
        copy16  FR1, arg2               ; 12 cyc
        jsr     imul16_func             ; 470-780 cyc
        shift_round_16 FR2, shift
        copy16  dest, FR2 + 2           ; 12 cyc
.endmacro
; imul16_func: signed 16 x 16 -> 32-bit multiply.
;   in:  FR0 = arg1, FR1 = arg2 (16-bit signed; both may be negated in place)
;   out: FR2 = 32-bit signed product
; Clobbers A and X.
; min 470 cycles
; max 780 cycles
.proc imul16_func
arg1 = FR0 ; 16-bit arg (clobbered)
arg2 = FR1 ; 16-bit arg (clobbered)
result = FR2 ; 32-bit result
ldx #0 ; 2 cyc
; counts the number of sign bits in X
check_sign arg1 ; 5 to 25 cyc
check_sign arg2 ; 5 to 25 cyc
; zero out the 32-bit temp's top 16 bits
lda #0 ; 2 cyc
sta result + 2 ; 3 cyc
sta result + 3 ; 3 cyc
; the bottom two bytes will get cleared by the shifts
; unrolled loop for maximum speed, at the cost
; of a larger routine
; 440 to 696 cycles
.repeat 16, bitnum
; bitnum < 8: 25 or 41 cycles
; bitnum >= 8: 30 or 46 cycles
bitmul16 arg1, arg2, result, bitnum
.endrepeat
; In case of mixed input signs, return a negative result.
; X is 0, 1 or 2 here; only exactly one negative input flips the sign.
cpx #1 ; 2 cyc
bne positive_result ; 2 cyc
neg32 result ; 34 cyc
positive_result:
rts ; 6 cyc
.endproc
; round16: round the top 16 bits of a 32-bit fixed-point number in place.
;   arg - address of the 32-bit little-endian value; bytes arg+2/arg+3
;         receive the rounded result, bytes arg/arg+1 are only read.
; Rounding rule, based on the low word (arg, arg + 1):
;   low word > $8000: round up
;            = $8000: round up if the value is positive,
;                     leave as-is if it is negative
;            < $8000: round down (leave as-is)
; Clobbers A and the flags.
.macro round16 arg
    .local increment
    .local high_half
    .local next
        lda     arg + 1
        cmp     #$80
        beq     high_half               ; $80xx: need the low byte to decide
        bcs     increment               ; > $80 (unsigned): above one half
        bcc     next                    ; < $80: below one half
high_half:
        ; High byte is exactly $80, so any nonzero low byte puts the low
        ; word above $8000. The earlier bpl/bmi test sent $8080..$80FF to
        ; `next`, rounding those values down.
        lda     arg
        bne     increment
        ; Low word is exactly $8000: the tie is decided by the sign bit.
        lda     arg + 3
        bmi     next
increment:                              ; 5-10 cyc
        inc     arg + 2                 ; 5 cyc
        bne     next                    ; 2 cyc
        inc     arg + 3                 ; 5 cyc
next:
.endmacro
; mandelbrot: iterate z = z^2 + c for one point until it escapes or the
; 8-bit iteration counter wraps.
.proc mandelbrot
; input:
; cx: position scaled to 4.12 fixed point - -8..+7.9
; cy: position scaled to 4.12
;
; output:
; iter: iteration count at escape or 0
;       (0 means the counter wrapped past 255 without an escape)
; clobbers: A, X, the zx..dist work variables, FR0/FR1/FR2
; zx = 0
; zy = 0
; zx_2 = 0
; zy_2 = 0
; zx_zy = 0
; dist = 0
; iter = 0
; One countdown loop clears every byte from zx up to and including iter.
lda #00
ldx #(iter - zx + 1)
initloop:
sta zx - 1,x
dex
bne initloop
loop:
; iter++ & max-iters break
inc iter
bne keep_going
rts
keep_going:
; quick_exit: return from mandelbrot (via rts) when the 4.12 value at
; `arg` is >= max or <= -max; otherwise fall through. Only the high byte
; is compared, except for the exact -max case which also checks the low
; byte. `max` is an integer constant.
.macro quick_exit arg, max
.local positive
.local negative
.local nope_out
.local first_equal
.local all_done
; check sign bit
lda arg + 1
bmi negative
positive:
cmp #((max) << 4)
bmi all_done ; 'less than'
rts
negative:
cmp #(256 - ((max) << 4))
beq first_equal ; 'equal' on first byte
bpl all_done ; 'greater than'
nope_out:
rts
first_equal:
lda arg
beq nope_out ; 2nd byte 0 shows it's really 'equal'
all_done:
.endmacro
; 4.12: (-8 .. +7.9)
; zx = zx_2 - zy_2 + cx
sub16 zx, zx_2, zy_2
add16 zx, zx, cx
quick_exit zx, 2
; zy = zx_zy + zx_zy + cy
add16 zy, zx_zy, zx_zy
add16 zy, zy, cy
quick_exit zy, 2
; zx_2 = zx * zx
imul16_round zx_2, zx, zx, 4
; zy_2 = zy * zy
imul16_round zy_2, zy, zy, 4
; zx_zy = zx * zy
imul16_round zx_zy, zx, zy, 4
; dist = zx_2 + zy_2
add16 dist, zx_2, zy_2
quick_exit dist, 4
; if may be in the lake, look for looping output with a small buffer
; as an optimization vs running to max iters
; (idea only; no such check is implemented in this routine)
jmp loop
; NOTE(review): nothing in this proc branches to peace_out, and it follows
; an unconditional jmp, so it looks unreachable.
peace_out:
rts
.endproc
; zoom_factor: convert a screen coordinate to a fixed-point plane coordinate.
;   dest   - address of the 16-bit result
;   src    - address of the 16-bit signed screen coordinate
;   zoom   - address of the u8 zoom level
;   aspect - address of the 16-bit aspect factor (aspect_x or aspect_y)
; Computes dest = ((src << (8 - zoom)) * aspect), keeping the rounded top
; 16 bits of the product after a 4-bit left shift.
; Clobbers A, X, FR0, FR1, FR2.
; NOTE(review): the shift loop only stops when X reaches exactly 8, so it
; appears to assume zoom <= 8.
.macro zoom_factor dest, src, zoom, aspect
    .local cont
    .local enough
        ; dest = src << (8 - zoom)
        copy16  dest, src
        ldx     zoom
cont:
        cpx     #8
        beq     enough
        shl16   dest
        inx
        jmp     cont
enough:
        ; scale by the aspect factor (3/4 for y, 5/4 for x)
        imul16_round dest, dest, aspect, 4
.endmacro
; pset: plot one 2-bit pixel in the split framebuffer.
;   in:  sx, sy - signed screen coordinates relative to the screen center
;        iter   - iteration count, mapped to a colour through color_map
;   clobbers: A, X, Y, temp, pixel_ptr, pixel_color, pixel_mask,
;             pixel_shift, pixel_offset
.proc pset
; screen coords in signed sx,sy
; iter holds the target to use
; iter -> color
ldx iter
lda color_map,x
sta pixel_color
; mask starts with the lowest 2-bit pixel cleared: %11111100
lda #(255 - 3)
sta pixel_mask
; sy -> line base address in pixel_ptr
; (only the low byte of sy is tested for sign)
lda sy
bpl positive
negative:
; top half: start just past its last row; the negative sy * stride
; added below walks back up to the right line
lda #.lobyte(framebuffer_top + stride * half_height)
sta pixel_ptr
lda #.hibyte(framebuffer_top + stride * half_height)
sta pixel_ptr + 1
jmp point
positive:
; bottom half: row 0 is at framebuffer_bottom
lda #.lobyte(framebuffer_bottom)
sta pixel_ptr
lda #.hibyte(framebuffer_bottom)
sta pixel_ptr + 1
point:
; pixel_ptr += sy * stride
; temp * 40
; = temp * 32 + temp * 8
; = (temp << 5) + (temp << 3)
copy16 temp, sy
shl16 temp
shl16 temp
shl16 temp
; temp = sy * 8
add16 pixel_ptr, pixel_ptr, temp
shl16 temp
shl16 temp
; temp = sy * 32
add16 pixel_ptr, pixel_ptr, temp
; Ok so pixel_ptr points to the start of the line, which is 40 bytes.
; Get the byte and bit offsets
; temp (low byte) = sx + half_width, the column counted from the left edge
lda sx
clc
adc #half_width
sta temp
; pixel_shift = temp & 3
; pixel_color <<= pixel_shift (shifting in zeros)
; pixel_mask <<= pixel_shift (shifting in ones)
and #3
sta pixel_shift
; X = 3 - pixel_shift = number of 2-bit positions to shift left
lda #3
sec
sbc pixel_shift
tax
shift_loop:
; Z reflects X here: set by tax on entry and by dex on later passes
; (jmp leaves the flags untouched)
beq shift_done
asl pixel_color
asl pixel_color
sec
rol pixel_mask
sec
rol pixel_mask
dex
jmp shift_loop
shift_done:
; pixel_offset = temp >> 2
lda temp
lsr a
lsr a
sta pixel_offset
tay
; read, mask, or, write
lda (pixel_ptr),y
and pixel_mask
ora pixel_color
sta (pixel_ptr),y
rts
.endproc
; draw_text: write a string into the status line.
;   col  - starting column within textbuffer (constant)
;   len  - number of characters to draw (constant)
;   cstr - address of the ATASCII source string
; Each character is translated through char_map before it is stored.
; Clobbers A, X and Y.
.macro draw_text col, len, cstr
    .local loop
    .local done
        ldx     #0
loop:
        cpx     #len
        beq     done
        ldy     cstr,x                  ; ATASCII code of the next character
        lda     char_map,y              ; -> screen font entry
        sta     textbuffer + col,x
        inx
        jmp     loop
done:
.endmacro
; vblank_handler: vertical-blank handler installed by `start` via SETVBV.
; Increments the 8-bit frame counter, then leaves through the XITVBV
; vector.
.proc vblank_handler
inc count_frames
jmp XITVBV
.endproc
; update_speed: placeholder for the ms/px status update. The planned steps
; are listed below; the working implementation currently lives inline in
; `start`.
.proc update_speed
        ; convert frames (u16) to fp
        ; add to frames_total
        ; convert pixels (u16) to fp
        ; add to pixels_total
        ; (frames_total * 16.66666667) / pixels_total
        ; convert to ATASCII
        ; draw text

        ; Return explicitly so that a call to this stub cannot fall
        ; through into the code that follows it.
        rts
.endproc
; start: exported program entry point.
; Initialises state, clears the framebuffer, installs the display list and
; the vertical-blank handler, then renders every pixel (sy = -92..91,
; sx = -80..79) by calling mandelbrot and pset. While rendering it keeps
; the ms/px readout in the status line up to date. Never returns: it
; spins forever once the image is complete.
.proc start
        ; ox = 0; oy = 0
        ; count_frames = 0; count_pixels = 0
        lda     #0
        sta     ox
        sta     ox + 1
        sta     oy
        sta     oy + 1
        sta     count_frames
        sta     count_pixels

        ; total_ms = 0.0; total_pixels = 0.0
        ldx     #total_ms
        jsr     ZF1
        ldx     #total_pixels
        jsr     ZF1

        ; zoom = 2x
        lda     #1
        sta     zoom

        ; Disable display DMA
        lda     #0
        sta     DMACTL

        ; zero the range from framebuffer_top to framebuffer_end,
        ; one 256-byte page at a time
        lda     #.lobyte(framebuffer_top)
        sta     temp
        lda     #.hibyte(framebuffer_top)
        sta     temp + 1
zero_page_loop:
        lda     #0
        ldy     #0
zero_byte_loop:
        sta     (temp),y
        iny
        bne     zero_byte_loop
        inc     temp + 1
        lda     temp + 1
        cmp     #.hibyte(framebuffer_end)
        bne     zero_page_loop

        ; Copy the display list into properly aligned memory
        ; Can't cross 1024-byte boundaries :D
        ldx     #0
copy_byte_loop:
        lda     display_list_start,x
        sta     display_list,x
        inx
        cpx     #display_list_len
        bne     copy_byte_loop

        ; Set up the display list
        lda     #.lobyte(display_list)
        sta     DLISTL                  ; actual register
        sta     SDLSTL                  ; shadow register the OS will copy in
        lda     #.hibyte(display_list)
        sta     DLISTH                  ; actual register
        sta     SDLSTH                  ; shadow register the OS will copy in

        ; Status bar
        draw_text 0, str_self_len, str_self
        draw_text 40 - str_run_len, str_run_len, str_run

        ; Re-enable display DMA
        lda     #$22
        sta     DMACTL

        ; install the vblank handler
        lda     #7                      ; deferred
        ldx     #.hibyte(vblank_handler)
        ldy     #.lobyte(vblank_handler)
        jsr     SETVBV

main_loop:
        ; sy = -92 .. 91
        lda     #(256-half_height)
        sta     sy
        lda     #(256-1)
        sta     sy + 1
loop_sy:
        ; sx = -80 .. 79
        lda     #(256-half_width)
        sta     sx
        lda     #(256-1)
        sta     sx + 1
loop_sx:
        zoom_factor cx, sx, zoom, aspect_x
        zoom_factor cy, sy, zoom, aspect_y
        jsr     mandelbrot
        jsr     pset

        ; check if we should update the counters
        ;
        ; Both counters are unsigned bytes, so the comparisons use the
        ; carry flag: after cmp, carry set means A >= operand. The N flag
        ; gives the wrong answer once the difference exceeds 127, which is
        ; always possible against 160 and happens above 247 against 120.
        ;
        ; count_pixels >= width? update!
        inc     count_pixels
        lda     count_pixels
        cmp     #width
        bcs     update_status
        ; count_frames >= 120? update!
        lda     count_frames
        cmp     #120                    ; >= 2 seconds
        bcc     skip_status

update_status:
        ; FR0 = (float)count_pixels & clear count_pixels
        lda     count_pixels
        sta     FR0
        lda     #0
        sta     FR0 + 1
        sta     count_pixels
        jsr     IFP

        ; FR1 = total_pixels
        ldx     #.lobyte(total_pixels)
        ldy     #.hibyte(total_pixels)
        jsr     FLD1R

        ; FR0 += FR1
        jsr     FADD

        ; total_pixels = FR0
        ldx     #.lobyte(total_pixels)
        ldy     #.hibyte(total_pixels)
        jsr     FST0R

        ; FR0 = (float)count_frames & clear count_frames
        ; warning: this should really disable interrupts @TODO
        lda     count_frames
        sta     FR0
        lda     #0
        sta     FR0 + 1
        sta     count_frames
        jsr     IFP

        ; FR0 *= ms_per_frame
        ldx     #.lobyte(ms_per_frame)
        ldy     #.hibyte(ms_per_frame)
        jsr     FLD1R
        jsr     FMUL

        ; FR0 += total_ms
        ldx     #total_ms
        ldy     #0
        jsr     FLD1R
        jsr     FADD

        ; total_ms = FR0
        ldx     #total_ms
        ldy     #0
        jsr     FST0R

        ; FR0 /= total_pixels
        ldx     #total_pixels
        ldy     #0
        jsr     FLD1R
        jsr     FDIV

        ; convert to ASCII in INBUFF
        jsr     FASC

        ; copy characters until the one with the high bit set (the last)
        ldy     #0
number_loop:
        lda     (INBUFF),y
        bmi     lastchar
        tax
        lda     char_map,x
        sta     textbuffer + speed_start,y
        iny
        bpl     number_loop
lastchar:
        ; Y is last char
        ; trim that high bit
        and     #$7f
        tax
        lda     char_map,x
        sta     textbuffer + speed_start,y

        ; Fill out any remaining spaces
        lda     #0
space_loop:
        iny
        sta     textbuffer + speed_start,y
        cpy     #(20)
        bmi     space_loop

skip_status:
        ; sx++ (16-bit); the row is done when the low byte reaches half_width
        clc
        lda     sx
        adc     #1
        sta     sx
        lda     sx + 1
        adc     #0
        sta     sx + 1
        lda     sx
        cmp     #half_width
        beq     loop_sx_done
        jmp     loop_sx                 ; too far away for a relative branch
loop_sx_done:
        ; sy++ (16-bit); the frame is done when the low byte reaches half_height
        clc
        lda     sy
        adc     #1
        sta     sy
        lda     sy + 1
        adc     #0
        sta     sy + 1
        lda     sy
        cmp     #half_height
        beq     loop_sy_done
        jmp     loop_sy
loop_sy_done:
        draw_text 40 - str_done_len, str_done_len, str_done
loop:
        ; finished
        jmp     loop
.endproc

View file

@ -14,37 +14,30 @@ Non-goals:
Enjoy! I'll probably work on this off and on for the next few weeks until I've got it producing fractals.
-- brooke, january 2023 - december 2024
-- brion, january 2023
## Current state
Basic rendering is functional, with interactive zoom/pan (+/-/arrows) and 6 preset viewports via the number keys.
Basic rendering is functional, but no interactive behavior (zoom/pan) or benchmarking is done yet.
The 16-bit signed integer multiplication takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
The 16-bit signed integer multiplication works; it takes two 16-bit inputs and emits one 32-bit output in the zero page, using the Atari OS ROM's floating point registers as workspaces. Inputs are clobbered.
* 16-bit multiplies are decomposed into 4 8-bit unsigned multiplies and some addition
* an optimized case for squares uses a table of 8-bit squares to reduce the number of 8-bit multiplication sub-ops
* when expanded RAM is available as on 130XE, a 64KB 8-bit multiplication table accelerates the remaining multiplications
* without expanded RAM, a table of half-squares is used to implement the algorithm from https://everything2.com/title/Fast+6502+multiplication
The main loop is a basic add-and-shift, using 16-bit adds which requires flipping the sign of negative inputs (otherwise you'd have to add all those sign-extension bits). Runs in 470-780 cycles depending on input.
The mandelbrot calculations are done using 3.13-precision fixed point numbers with 6.26-precision intermediates.
The mandelbrot calculations are done using 4.12-precision fixed point numbers. It may be possible to squish this down to 3.13.
Iterations are capped at 255.
The pixels are run in a progressive layout to get the basic shape on screen faster.
## Next steps
There is a running counter of ms/px using the vertical blank interrupts as a timer, used to track our progress. :D
Add a running counter of ms/px using the vertical blank interrupts as a timer. This'll show how further work improves it!
There's a check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
Check for cycles in (zx,zy) output when in the 'lake'; if values repeat, they cannot escape. This is a big time saver in fractint.
There's some cute color cycling.
I may be able to do a faster multiply using tables of squares for 8-bit component multiplication.
## Deps and build instructions
I'm using `ca65` as a macro assembler, and have a Unix-style `Makefile` for building. Should work fairly easily on Linux and Mac. Might work on "raw" Windows but I use WSL for that.
Currently produces a `.xex` executable, which can be booted up in common Atari emulators and some i/o devices.
## Todo
See ideas in `todo.md`.

View file

@ -1,50 +0,0 @@
/**
 * Render a 256-entry byte table as assembler source.
 *
 * Calls `func(i)` for i = 0..255 in ascending order and formats the
 * results as 16 `.byte` lines of 16 comma-separated values each.
 *
 * @param {(i: number) => number} func - produces the value for index i
 * @returns {string} the 16 lines joined with newlines (no trailing newline)
 */
function db(func) {
    const rows = Array.from({ length: 16 }, (_, row) => {
        const cells = Array.from({ length: 16 }, (_, col) => func(row * 16 + col));
        return ' .byte ' + cells.join(', ');
    });
    return rows.join('\n');
}
// Half-squares table: squares[i] = trunc((i * i + 1) / 2) for i = 0..511.
const squares = Array.from({ length: 512 }, (_, i) => Math.trunc((i * i + 1) / 2));
console.log(
`.segment "TABLES"
.export mul_lobyte256
.export mul_hibyte256
.export mul_hibyte512
.export sqr_lobyte
.export sqr_hibyte
; (i * i + 1) / 2 for the multiplier
.align 256
mul_lobyte256:
${db((i) => squares[i] & 0xff)}
.align 256
mul_hibyte256:
${db((i) => (squares[i] >> 8) & 0xff)}
.align 256
mul_hibyte512:
${db((i) => (squares[i + 256] >> 8) & 0xff)}
; (i * i) for the plain squares
.align 256
sqr_lobyte:
${db((i) => (i * i) & 0xff)}
.align 256
sqr_hibyte:
${db((i) => ((i * i) >> 8) & 0xff)}
`);

View file

@ -1,41 +0,0 @@
// Identity used by the multiplier: a*x = (a + x)^2/2 - a^2/2 - x^2/2

/**
 * Rounded half-square of x, reduced to 16 bits.
 *
 * @param {number} x - non-negative integer
 * @returns {number} Math.round(x * x / 2) masked to the low 16 bits
 */
function half_square(x) {
    const rounded = Math.round((x * x) / 2);
    return rounded & 0xffff;
}
/**
 * 8 x 8 -> 16-bit unsigned multiply built from half-square lookups,
 * with every step kept in 16-bit modular arithmetic.
 *
 * The `a & b & 1` term compensates for the rounding in half_square when
 * both inputs are odd.
 *
 * @param {number} a - integer 0..255
 * @param {number} b - integer 0..255
 * @returns {number} the product as an unsigned 16-bit value
 */
function mul8(a, b) {
    const WORD = 0xffff;
    const sum = half_square(a + b) & WORD;
    const lessA = (sum - half_square(a)) & WORD;
    const lessB = (lessA - half_square(b)) & WORD;
    const fixed = (lessB + (a & b & 1)) & WORD;
    return fixed >>> 0;
}
/**
 * 16 x 16 -> 32-bit unsigned multiply composed of four mul8 partial
 * products (low*low, high*low, low*high, high*high), accumulated with the
 * same intermediate bit widths the 6502 routine would use.
 *
 * @param {number} a - integer 0..65535
 * @param {number} b - integer 0..65535
 * @returns {number} the product as an unsigned 32-bit value
 */
function mul16(a, b) {
    const aHi = (a >>> 8) & 0xff;
    const aLo = a & 0xff;
    const bHi = (b >>> 8) & 0xff;
    const bLo = b & 0xff;

    // [partial product, left shift, mask applied after accumulating]
    const steps = [
        [mul8(aLo, bLo), 0, 0xffff],
        [mul8(aHi, bLo), 8, 0x00ffffff],
        [mul8(aLo, bHi), 8, 0x01ffffff],
        [mul8(aHi, bHi), 16, 0xffffffff],
    ];

    return steps.reduce(
        (acc, [partial, shift, mask]) => ((acc + (partial << shift)) & mask) >>> 0,
        0
    );
}
// Exhaustive self-check: compares mul16(a, b) against Math.imul for every
// pair 0 <= a, b < max and logs each mismatch. With max = 65536 that is
// 2^32 pairs, so a full run is slow; the commented-out values below give
// smaller ranges.
let max = 65536;
//let max = 256;
//let max = 128;
//let max = 8;
for (let a = 0; a < max; a++) {
for (let b = 0; b < max; b++) {
// Math.imul yields a signed 32-bit product; >>> 0 reinterprets it as unsigned.
let expected = Math.imul(a, b) >>> 0;
//let actual = mul8(a, b);
let actual = mul16(a, b);
if (expected !== actual) {
console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
}
}
}

17
todo.md
View file

@ -1,17 +0,0 @@
things to try:
* fix status bar to show elapsed time, per-iter time, per-pixel iter count
* 'turbo' mode disabling graphics in full or part
* patch the entire expanded-ram imul8xe on top of imul8 to avoid the 3-cycle thunk penalty :D
* maybe clean up the load/layout of the big mul table
* consider alternate lookup tables in the top 16KB under ROM
* y-axis mirror optimization
* extract viewport for display & re-input via keyboard
* fujinet screenshot/viewport uploader