Merge branch 'fastmul' into fastmul2

2024-11-11 11:45:58 -08:00 · 2024-11-11 11:45:58 -08:00 · 97948dc814
commit 97948dc814
parent 29630c8887 f10bb4fe18
5 changed files with 196 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 *.o
 *.xex
 tables.s
 .DS_Store
--- a/8
+++ b/8
@ -2,13 +2,17 @@
 all : mandel.xex
-%.xex : %.o
+mandel.xex : mandel.o tables.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+
 %.o : %.s
 	ca65 -o $@ $<
 tables.s : tables.js
 	node tables.js > tables.s
 clean :
 	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex
--- a/mandel.s
+++ b/mandel.s
@ -25,14 +25,15 @@ z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not
 z_buffer_start  = $b1 ; u8: index into z_buffer
 z_buffer_end    = $b2 ; u8: index into z_buffer
 temp            = $b4 ; u16
 temp2           = $b6 ; u16
-pixel_ptr       = $b6 ; u16
+pixel_ptr       = $b8 ; u16
-pixel_color     = $b8 ; u8
+pixel_color     = $ba ; u8
-pixel_mask      = $b9 ; u8
+pixel_mask      = $bb ; u8
-pixel_shift     = $ba ; u8
+pixel_shift     = $bc ; u8
-pixel_offset    = $bb ; u8
+pixel_offset    = $bd ; u8
-fill_level      = $bc ; u8
+fill_level      = $be ; u8
-palette_offset  = $bd ; u8
+palette_offset  = $bf ; u8
 ; FP registers in zero page
 FR0    = $d4 ; float48
@ -107,6 +108,10 @@ KEY_RIGHT = $87
    mantissa .byte 6
 .endstruct
 .import mul_lobyte256
 .import mul_hibyte256
 .import mul_hibyte512
 .data
 strings:
@ -257,6 +262,12 @@ fill_masks:
    add 4, dest, arg2, dest
 .endmacro
 .macro add_carry dest
    lda dest
    adc #0
    sta dest
 .endmacro
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
@ -387,12 +398,12 @@ next:
 ; 5 to 25 cycles
 .macro check_sign arg
    ; Check sign bit and flip argument to postive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register.
    .local positive
    lda arg + 1   ; 3 cyc
    bpl positive  ; 2 cyc
    neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
+    iny           ; 2 cyc
 positive:
 .endmacro
@ -421,13 +432,13 @@ positive:
 ; min 470 cycles
 ; max 780 cycles
-.proc imul16_func
+.proc imul16_func_orig
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
-    ldx #0          ; 2 cyc
+    ldy #0          ; 2 cyc
-    ; counts the number of sign bits in X
+    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
@ -447,7 +458,98 @@ positive:
    .endrepeat
    ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
+    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
 positive_result:
    rts ; 6 cyc
 .endproc
 ; Adapted from https://everything2.com/title/Fast+6502+multiplication
 .macro imul8 dest, arg1, arg2
    .local under256
    .local next
    .local small_product
    .scope
        mul_factor_a   = arg1
        mul_factor_x   = arg2
        mul_product_lo = dest
        mul_product_hi = dest + 1
        lda mul_factor_a      ; setup: 6 cycles
        ;ldx mul_factor_x
        clc                   ; (a + x)^2/2: 23 cycles
        adc mul_factor_x
        tax
        bcc under256
        lda mul_hibyte512,x
        bcs next
    under256:
        lda mul_hibyte256,x
        sec
    next:
        sta mul_product_hi
        lda mul_lobyte256,x
        ldx mul_factor_a      ; - a^2/2: 20 cycles
        sbc mul_lobyte256,x
        sta mul_product_lo
        lda mul_product_hi
        sbc mul_hibyte256,x
        sta mul_product_hi
        ldx mul_factor_x      ; + x & a & 1: 22 cycles
        txa                   ; (this is a kludge to correct a
        and mul_factor_a      ; roundoff error that makes odd * odd too low)
        and #1
        clc
        adc mul_product_lo
        bcc small_product
        inc mul_product_hi
    small_product:
        sec                   ; - x^2/2: 25 cycles
        sbc mul_lobyte256,x
        lda mul_product_hi
        sbc mul_hibyte256,x
        sta mul_product_hi
    .endscope
 .endmacro
 .proc imul16_func
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result
    inter = temp2
    ldy #0          ; 2 cyc
    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
    lda #0
    sta result + 0
    sta result + 1
    sta result + 2
    sta result + 3
    imul8 inter, arg1, arg2
    add16 result, result, inter
    imul8 inter, arg1 + 1, arg2
    add16 result + 1, result + 1, inter
    imul8 inter, arg1, arg2 + 1
    add16 result + 1, result + 1, inter
    add_carry result + 3
    imul8 inter, arg1 + 1, arg2 + 1
    add16 result + 2, result + 2, inter
    ; In case of mixed input signs, return a negative result.
    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
 positive_result:
--- a/tables.js
+++ b/tables.js
@ -0,0 +1,33 @@
 function db(func) {
    let lines = [];
    for (let i = 0; i < 256; i += 16) {
        let items = [];
        for (let j = 0; j < 16; j++) {
            let x = i + j;
            items.push(func(x));
        }
        lines.push('    .byte ' + items.join(', '));
    }
    return lines.join('\n');
 }
 console.log(
 `.segment "TABLES"
 .export mul_lobyte256
 .export mul_hibyte256
 .export mul_hibyte512
 .align 256
 mul_lobyte256:
 ${db((x) => Math.round(x * x / 2) & 0xff)}
 .align 256
 mul_hibyte256:
 ${db((x) => (Math.round(x * x / 2) >> 8) & 0xff)}
 .align 256
 mul_hibyte512:
 ${db((x) => (Math.round((x + 256) * (x + 256) / 2) >> 8) & 0xff)}
 `);
--- a/testme.js
+++ b/testme.js
@ -0,0 +1,41 @@
 // ax = (a + x)2/2 - a2/2 - x2/2 
 function half_square(x) {
    return Math.round(x * x / 2) & 0xffff >>> 0;
 }
 function mul8(a, b) {
    let result = half_square(a + b) & 0xffff;
    result = (result - half_square(a)) & 0xffff;
    result = (result - half_square(b)) & 0xffff;
    result = (result + (b & a & 1)) & 0xffff;
    return result >>> 0;
 }
 function mul16(a, b) {
    let ah = (a & 0xff00) >>> 8;
    let al = (a & 0x00ff) >>> 0;
    let bh = (b & 0xff00) >>> 8;
    let bl = (b & 0x00ff) >>> 0;
    let result = (mul8(al, bl) & 0xffff) >>> 0;
    result = ((result + (mul8(ah, bl) << 8)) & 0x00ffffff) >>> 0;
    result = ((result + (mul8(al, bh) << 8)) & 0x01ffffff) >>> 0;
    result = ((result + (mul8(ah, bh) << 16)) & 0xffffffff) >>> 0;
    return result;
 }
 let max = 65536;
 //let max = 256;
 //let max = 128;
 //let max = 8;
 for (let a = 0; a < max; a++) {
    for (let b = 0; b < max; b++) {
        let expected = Math.imul(a, b) >>> 0;
        //let actual = mul8(a, b);
        let actual = mul16(a, b);
        if (expected !== actual) {
            console.log(`wrong! ${a} * ${b} expected ${expected} got ${actual}`);
        }
    }
 }