WIP alternate imul16

not working at present
2023-02-11 12:24:48 -08:00 · 2023-02-11 12:24:48 -08:00 · e3c80bff59
commit e3c80bff59
parent 839330edb3
3 changed files with 116 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 *.o
 *.xex
+tables.s
 .DS_Store
--- a/8
+++ b/8
@ -2,13 +2,17 @@

 all : mandel.xex

-%.xex : %.o
-	ld65 -C atari-asm-xex.cfg -o $@ $<
+mandel.xex : mandel.o tables.o
+	ld65 -C ./atari-asm-xex.cfg -o $@ $+

 %.o : %.s
 	ca65 -o $@ $<

+tables.s : tables.js
+	node tables.js > tables.s
+
 clean :
+	rm -f tables.s
 	rm -f *.o
 	rm -f *.xex

--- a/mandel.s
+++ b/mandel.s
@ -22,11 +22,12 @@ total_ms     = $a4 ; float48
 total_pixels = $aa ; float48

 temp         = $b0 ; u16
-pixel_ptr    = $b2 ; u16
-pixel_color  = $b4 ; u8
-pixel_mask   = $b5 ; u8
-pixel_shift  = $b6 ; u8
-pixel_offset = $b7 ; u8
+temp2        = $b2 ; u16
+pixel_ptr    = $b4 ; u16
+pixel_color  = $b6 ; u8
+pixel_mask   = $b7 ; u8
+pixel_shift  = $b8 ; u8
+pixel_offset = $b9 ; u8


 ; FP registers in zero page
@ -83,6 +84,10 @@ SETVBV = $E45C
    mantissa .byte 6
 .endstruct

+.import mul_lobyte256
+.import mul_hibyte256
+.import mul_hibyte512
+
 .data

 strings:
@ -206,6 +211,12 @@ color_map:
    add 4, dest, arg2, dest
 .endmacro

+.macro add_carry dest ; dest = dest + carry flag (single byte); clobbers A
+    lda dest          ; load does not alter C, so the carry left by the preceding add survives
+    adc #0            ; A = dest + 0 + C
+    sta dest          ; write back; C now holds the carry-out of this byte
+.endmacro
+
 ; 2 + 9 * byte cycles
 .macro sub bytes, dest, arg1, arg2
    sec ; 2 cyc
@ -336,12 +347,12 @@ next:
 ; 5 to 25 cycles
 .macro check_sign arg
    ; Check sign bit and flip argument to positive,
-    ; keeping a count of sign bits in the X register.
+    ; keeping a count of sign bits in the Y register.
    .local positive
    lda arg + 1   ; 3 cyc
    bpl positive  ; 2 cyc
    neg16 arg     ; 18 cyc
-    inx           ; 2 cyc
+    iny           ; 2 cyc
 positive:
 .endmacro

@ -370,13 +381,13 @@ positive:

 ; min 470 cycles
 ; max 780 cycles
-.proc imul16_func
+.proc imul16_func_orig
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

-    ldx #0          ; 2 cyc
-    ; counts the number of sign bits in X
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
    check_sign arg1 ; 5 to 25 cyc
    check_sign arg2 ; 5 to 25 cyc
    
@ -396,7 +407,94 @@ positive:
    .endrepeat

    ; In case of mixed input signs, return a negative result.
-    cpx #1              ; 2 cyc
+    cpy #1              ; 2 cyc
+    bne positive_result ; 2 cyc
+    neg32 result        ; 34 cyc
+positive_result:
+
+    rts ; 6 cyc
+.endproc
+
+; Adapted from https://everything2.com/title/Fast+6502+multiplication
+;
+; imul8 dest, arg1, arg2
+;   Unsigned 8-bit x 8-bit -> 16-bit multiply built from table lookups:
+;       a * x = (a + x)^2/2 - a^2/2 - x^2/2   (+ 1 when both are odd)
+;   arg1, arg2: addresses of the two 8-bit factors (only read)
+;   dest:       address of the 16-bit product, low byte first
+;   Clobbers A, X and the flags; Y is never touched.
+;   Uses the imported tables mul_lobyte256, mul_hibyte256, mul_hibyte512
+;   (presumably low/high bytes of i^2/2 for i < 256 and the high byte for
+;   256 <= i < 512 -- TODO confirm against tables.js).
+;   dest must not overlap arg1 or arg2: both factors are read again
+;   after the first partial result has been stored into dest.
+.macro imul8 dest, arg1, arg2
+    .scope
+        mul_factor_a   = arg1
+        mul_factor_x   = arg2
+        mul_product_lo = dest
+        mul_product_hi = dest + 1
+
+        lda mul_factor_a      ; setup: 6 cycles
+        ;ldx mul_factor_x     ; (not needed: X is loaded by the tax below)
+
+        clc                   ; (a + x)^2/2: 23 cycles
+        adc mul_factor_x
+        tax                   ; X = (a + x) & $ff, C = bit 8 of the sum
+        bcc under256
+        lda mul_hibyte512,x   ; sum >= 256: use the upper-half table
+        bcs next              ; always taken; C stays set for the sbc below
+    under256:
+        lda mul_hibyte256,x
+        sec                   ; no borrow going into the first subtraction
+    next:
+        sta mul_product_hi
+        lda mul_lobyte256,x
+
+        ldx mul_factor_a      ; - a^2/2: 20 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo
+        lda mul_product_hi
+        sbc mul_hibyte256,x   ; borrow from the low byte propagates via C
+        sta mul_product_hi
+
+        ldx mul_factor_x      ; + x & a & 1: 22 cycles
+        txa                   ; (this is a kludge to correct a
+        and mul_factor_a      ; roundoff error that makes odd * odd too low)
+        and #1
+
+        clc
+        adc mul_product_lo    ; A = low byte + correction bit
+        bcc small_product
+        inc mul_product_hi    ; carry out of the low byte
+    small_product:
+        sec                   ; - x^2/2: 25 cycles
+        sbc mul_lobyte256,x
+        sta mul_product_lo    ; store the final low byte before A is reused;
+                              ; without this the x^2/2 and odd*odd terms
+                              ; never reach the low byte of dest
+        lda mul_product_hi
+        sbc mul_hibyte256,x
+        sta mul_product_hi
+    .endscope
+.endmacro
+
+.proc imul16_func
+    arg1 = FR0   ; 16-bit arg (clobbered)
+    arg2 = FR1   ; 16-bit arg (clobbered)
+    result = FR2 ; 32-bit result
+
+    ldy #0          ; 2 cyc
+    ; counts the number of sign bits in Y
+    check_sign arg1 ; 5 to 25 cyc
+    check_sign arg2 ; 5 to 25 cyc
+
+    lda #0
+    sta result + 0
+    sta result + 1
+    sta result + 2
+    sta result + 3
+
+    imul8 temp, arg1, arg2
+    add16 result, result, temp
+
+    imul8 temp, arg1 + 1, arg2
+    add16 result + 1, result + 1, temp
+
+    imul8 temp, arg1, arg2 + 1
+    add16 result + 1, result + 1, temp
+    add_carry result + 3
+
+    imul8 temp, arg1 + 1, arg2 + 1
+    add16 result + 2, result + 2, temp
+
+    ; In case of mixed input signs, return a negative result.
+    cpy #1              ; 2 cyc
    bne positive_result ; 2 cyc
    neg32 result        ; 34 cyc
 positive_result: