Compare commits

..

1 commit

Author SHA1 Message Date
fa0de6dc77 WIP savings of half a cycle per imul8_xe
Uses X to cache arg1, which is always needed a second time, instead of
arg2, which is only needed a second time when arg1 is odd.

Should save half a cycle per imul8_xe, untested
2025-09-16 21:29:40 -07:00
5 changed files with 65 additions and 156 deletions

View file

@ -2,11 +2,8 @@
all : mandel.xex
mandel.xex : mandel.o mandel-core.o tables.o atari-xex.cfg
ld65 -C ./atari-xex.cfg --mapfile mandel.map -o $@ mandel.o mandel-core.o tables.o atari.lib
mandel.s : mandel.c mandel.h
cc65 -o $@ mandel.c
mandel.xex : mandel.o tables.o atari-asm-xex.cfg
ld65 -C ./atari-asm-xex.cfg --mapfile mandel.map -o $@ mandel.o tables.o
%.o : %.s
ca65 -o $@ $<
@ -16,7 +13,8 @@ tables.s : tables.js
clean :
rm -f tables.s
rm -f mandel.s
rm -f *.o
rm -f *.xex
rm -f mandel.map

View file

@ -1,69 +0,0 @@
# ld65 linker configuration for C programs using the Atari binary (XEX) file support.
# Derived from the sample config; use with:
#   cl65 -tatari -Catari-xex.cfg prog.c -o prog.xex
#
# Layout described by this file:
#   $0082-$00FF  ZP          zero page (not written to the output file)
#   $2E00-$30FF  SYSCHKCHNK  "system check" load chunk
#   $3100-$36FF  TABLES      lookup tables (6 pages, page-aligned segment)
#   $4000-$7FFF  BANKSWITCH  reserved bank-switch window (not written to the output file)
#   $8000-...    MAIN        main program, ending below the stack/framebuffer

# Default load address of the main program; referenced below as %S.
FEATURES {
STARTADDRESS: default = $8000;
}
# Linker-defined symbols. "weak" symbols are defaults that can be overridden.
SYMBOLS {
__SYSTEM_CHECK__: type = import; # force inclusion of "system check" load chunk
__STACKSIZE__: type = weak, value = $0800; # 2k stack
__STARTADDRESS__: type = export, value = %S;
__RESERVED_MEMORY__: type = weak, value = $0000;
__SYSCHKHDR__: type = export, value = 0; # Disable system check header
__SYSCHKTRL__: type = export, value = 0; # Disable system check trailer
# $2E00 + $0300 = $3100: the tables start right after the SYSCHKCHNK area.
__TABLESEG_START__: type = weak, value = $2E00 + $0300;
# Six 256-byte pages of tables.
__TABLESEG_SIZE__: type = weak, value = 6 * $100;
# Bank-switch window: $4000-$7FFF.
__BANKSY_START__: type = weak, value = $4000;
__BANKSY_SIZE__: type = weak, value = $4000;
# Upper bound used when sizing MAIN (see below).
__FRAMEBUFFER_START__: type = weak, value = $A000;
}
MEMORY {
# Note -- $80 and $81 (LOMEM) appear to be reserved in ZP, so ZP covers $82-$FF only.
ZP: file = "", define = yes, start = $0082, size = $007E;
# "system check" load chunk
SYSCHKCHNK: file = %O, start = $2E00, size = $0300;
# Note: $A000-$BFFF (see __FRAMEBUFFER_START__) is against the BASIC cartridge,
# so running may require booting with OPTION.
TABLES: file = %O, define = yes, start = __TABLESEG_START__, size = __TABLESEG_SIZE__;
# We reserve $4000-$7FFF for the bank-switch window.
# In theory we could keep data and code here that we only use on 48k/64k systems.
BANKSWITCH: file = "", define = yes, start = __BANKSY_START__, size = __BANKSY_SIZE__;
# "main program" load chunk: runs from the start address up to the framebuffer,
# minus the space set aside for the stack and reserved memory.
MAIN: file = %O, define = yes, start = %S, size = __FRAMEBUFFER_START__ - __STACKSIZE__ - __RESERVED_MEMORY__ - %S;
}
FILES {
%O: format = atari;
}
# Atari executable format: run address is `start`; the system check chunk
# gets an init address of __SYSTEM_CHECK__.
FORMATS {
atari: runad = start,
initad = SYSCHKCHNK: __SYSTEM_CHECK__;
}
# Mapping of assembler/compiler segments onto the memory areas above.
SEGMENTS {
ZEROPAGE: load = ZP, type = zp;
EXTZP: load = ZP, type = zp, optional = yes;
SYSCHK: load = SYSCHKCHNK, type = rw, define = yes, optional = yes;
TABLES: load = TABLES, type = ro, optional = yes, align = 256;
# NOTE(review): the segment is spelled "BANKSWICH" while the memory area is
# "BANKSWITCH"; sources must use the same spelling -- confirm this is intended.
BANKSWICH: load = BANKSWITCH, type = ro, optional = yes;
STARTUP: load = MAIN, type = ro, define = yes;
LOWBSS: load = MAIN, type = rw, optional = yes; # not zero initialized
LOWCODE: load = MAIN, type = ro, define = yes, optional = yes;
ONCE: load = MAIN, type = ro, optional = yes;
CODE: load = MAIN, type = ro, define = yes;
RODATA: load = MAIN, type = ro;
DATA: load = MAIN, type = rw;
INIT: load = MAIN, type = rw, optional = yes;
BSS: load = MAIN, type = bss, define = yes;
}
# Constructor / destructor / interruptor tables generated by the linker.
FEATURES {
CONDES: type = constructor,
label = __CONSTRUCTOR_TABLE__,
count = __CONSTRUCTOR_COUNT__,
segment = ONCE;
CONDES: type = destructor,
label = __DESTRUCTOR_TABLE__,
count = __DESTRUCTOR_COUNT__,
segment = RODATA;
CONDES: type = interruptor,
label = __INTERRUPTOR_TABLE__,
count = __INTERRUPTOR_COUNT__,
segment = RODATA,
import = __CALLIRQ__;
}

View file

@ -1,15 +0,0 @@
/**
* The UI and I/O wrapper for the Mandelbrot runner, in C.
*
* For the moment *all* logic is in mandel-core.s, I'm just
* trying to get this to run within a cc65 environment.
* Eventually just the inner loop fun will live in there.
*/
#include <stdlib.h>
#include <stdio.h>
#include "mandel.h"
/* Program entry point: all work is delegated to mandel_start(). */
void main(void)
{
    mandel_start();
}

View file

@ -1,4 +0,0 @@
#include <inttypes.h>
// From mandel-core.s:
extern void mandel_start(void);

View file

@ -1,44 +1,44 @@
.zeropage
; Our zero-page vars
ox = $80 ; fixed6.26: center point x
oy = $84 ; fixed6.26: center point y
cx = $88 ; fixed6.26: c_x
cy = $8c ; fixed6.26: c_y
ox: .res 4 ; fixed6.26: center point x
oy: .res 4 ; fixed6.26: center point y
cx: .res 4 ; fixed6.26: c_x
cy: .res 4 ; fixed6.26: c_y
zx = $90 ; fixed6.26: z_x
zy = $94 ; fixed6.26: z_y
zx_2 = $98 ; fixed6.26: z_x^2
zy_2 = $9c ; fixed6.26: z_y^2
zx: .res 4 ; fixed6.26: z_x
zy: .res 4 ; fixed6.26: z_y
zx_2: .res 4 ; fixed6.26: z_x^2
zy_2: .res 4 ; fixed6.26: z_y^2
zx_zy = $a0 ; fixed6.26: z_x * z_y
dist = $a4 ; fixed6.26: z_x^2 + z_y^2
sx = $a8 ; i16: screen pixel x
sy = $aa ; i16: screen pixel y
z_buffer_active = $ac ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start = $ad ; u8: index into z_buffer
z_buffer_end = $ae ; u8: index into z_buffer
iter = $af ; u8: iteration count
zx_zy: .res 4 ; fixed6.26: z_x * z_y
dist: .res 4 ; fixed6.26: z_x^2 + z_y^2
ptr = $b0 ; u16
pixel_ptr = $b2 ; u16
zoom = $b4 ; u8: zoom shift level
fill_level = $b5 ; u8
pixel_color = $b6 ; u8
pixel_mask = $b7 ; u8
pixel_shift = $b8 ; u8
pixel_offset = $b9 ; u8
palette_offset = $ba ; u8
chroma_offset = $bb ; u8
palette_ticks = $bc ; u8
chroma_ticks = $bd ; u8
count_frames = $be ; u8
; free space $bf
z_buffer_active: .res 1 ; boolean: 1 if we triggered the lake, 0 if not
z_buffer_start: .res 1 ; u8: index into z_buffer
z_buffer_end: .res 1 ; u8: index into z_buffer
iter: .res 1 ; u8: iteration count
ptr: .res 2 ; u16
temp: .res 2 ; u16
temp2: .res 2 ; u16
.data
; can move to .data
sx: .res 2 ; i16: screen pixel x
sy: .res 2 ; i16: screen pixel y
zoom: .res 1 ; u8: zoom shift level
fill_level: .res 1 ; u8
pixel_color: .res 1 ; u8
pixel_mask: .res 1 ; u8
pixel_shift: .res 1 ; u8
pixel_offset: .res 1 ; u8
palette_offset: .res 1 ; u8
chroma_offset: .res 1 ; u8
palette_ticks: .res 1 ; u8
chroma_ticks: .res 1 ; u8
count_frames: .res 1 ; u8
count_iters: .res 2 ; u16
text_col: .res 1 ; u8
text_row: .res 1 ; u8
count_iters = $c0 ; u16
text_col = $c2 ; u8
text_row = $c3 ; u8
; free space c4-cb
temp = $cc ; u16
temp2 = $ce ; u16
palette_delay = 23
chroma_delay = 137
@ -131,8 +131,6 @@ KEY_E = 42
KEY_X = 22
KEY_Y = 43
.data
.struct float48
exponent .byte
mantissa .byte 5
@ -144,6 +142,7 @@ KEY_Y = 43
.import sqr_lobyte
.import sqr_hibyte
.data
strings:
str_self:
@ -362,7 +361,7 @@ z_buffer:
.word 0
.endrepeat
.export _mandel_start
.export start
;max_fill_level = 6
max_fill_level = 3
@ -589,7 +588,7 @@ bank_switch_table:
.macro imul8 dest, arg1, arg2, xe
.if xe
; using 64KB lookup table
; 51-70 cycles
; 50-70 cycles
; clobbers x, y, dest, ptr
.scope
output = dest
@ -601,13 +600,13 @@ bank_switch_table:
; bottom 14 bits except the LSB are the per-bank table index
; add $4000 for the bank pointer
txa ; 2 cyc
and #$3f ; 2 cyc
ora #$40 ; 2 cyc
sta ptr + 1 ; 3 cyc
; copy the entry into output
lda arg1 ; 3 cyc
tax ; 2 cyc
and #$fe ; 2 cyc
tay ; 2 cyc
lda (ptr),y ; 5 cyc
@ -624,13 +623,13 @@ bank_switch_table:
;;sta PORTB ; 4 cyc - disabled
; check that 1 bit we skipped to fit into space
lda arg1 ; 3 cyc
txa ; 2 cyc
and #1 ; 2 cyc
beq done ; 2 cyc
; add arg2 one last time for the skipped bit
clc ; 2 cyc
txa ; 2 cyc
lda arg1 ; 3 cyc
adc output ; 3 cyc
sta output ; 3 cyc
lda #0 ; 2 cyc
@ -1254,21 +1253,21 @@ enough:
negative:
; point pixel_ptr at framebuffer_top + stride * half_height (top half)
lda #.lobyte(framebuffer_top + stride * half_height)
sta ptr
sta pixel_ptr
lda #.hibyte(framebuffer_top + stride * half_height)
sta ptr + 1
sta pixel_ptr + 1
jmp point
positive:
lda #.lobyte(framebuffer_bottom)
sta ptr
sta pixel_ptr
lda #.hibyte(framebuffer_bottom)
sta ptr + 1
sta pixel_ptr + 1
point:
; ptr += sy * stride
; pixel_ptr += sy * stride
; temp * 40
; = temp * 32 + temp * 8
; = (temp << 5) + (temp << 3)
@ -1276,10 +1275,10 @@ point:
shl16 temp
shl16 temp
shl16 temp
add16 ptr, ptr, temp
add16 pixel_ptr, pixel_ptr, temp
shl16 temp
shl16 temp
add16 ptr, ptr, temp
add16 pixel_ptr, pixel_ptr, temp
; Ok so pixel_ptr now points to the start of the line, which is 40 bytes.
; Get the byte and bit offsets
@ -1319,20 +1318,20 @@ shift_done:
draw_pixel:
; read, mask, or, write
lda (ptr),y
lda (pixel_ptr),y
and pixel_mask
ora pixel_color
sta (ptr),y
sta (pixel_ptr),y
dex
beq done
clc
lda #40
adc ptr
sta ptr
adc pixel_ptr
sta pixel_ptr
lda #0
adc ptr + 1
sta ptr + 1
adc pixel_ptr + 1
sta pixel_ptr + 1
jmp draw_pixel
done:
@ -1746,7 +1745,7 @@ zero_byte_loop:
rts
.endproc
.proc _mandel_start
.proc start
jsr imul8xe_init