; multiple-precision arithmetic routines
; the ARM is a 32-bit little-endian RISC processor

r0 RN 0
r1 RN 1
r2 RN 2
r3 RN 3
r4 RN 4
r5 RN 5
r6 RN 6
r7 RN 7
r8 RN 8
r9 RN 9
r10 RN 10
r11 RN 11
r12 RN 12
r14 RN 14
sp RN 13
lr RN 14
pc RN 15

	AREA |A$$Code|, CODE, READONLY

	IMPORT global_precision

gp_	DCD global_precision

	EXPORT _p_addc
; _p_addc
; r0,r1 point to the numbers, r2 is carry (0 or 1)
; return with number at r0 increased, new carry in r0
_p_addc	STMFD sp!,{r4,r5,lr}
	LDR r3,gp_			; addr of gp variable
	LDR r3,[r3]			; value of gp variable
	MOVS r3,r3,LSL#16		; <<16 [it's a short]
	MOVEQ r0,r2			; precision = 0 ??!
	LDMEQFD sp!,{r4,r5,pc}
	MOVS r2,r2,LSR#1		; carry into C flag
pa1	LDR r4,[r0]
	LDR r5,[r1],#4
	ADCS r4,r4,r5
	STR r4,[r0],#4
	SUB r3,r3,#1<<16
	TEQ r3,#0
	BNE pa1
	ADC r0,r3,#0			; r3==0 here, so r0 = carry
	LDMFD sp!,{r4,r5,pc}

	EXPORT _p_subb
; _p_subb
; r0,r1 point to the numbers, r2 is borrow (0 or 1)
; return with number at r1 decreased, new borrow in r0
_p_subb	STMFD sp!,{r4,r5,lr}
	LDR r3,gp_			; addr of gp variable
	LDR r3,[r3]			; value of gp variable
	MOVS r3,r3,LSL#16		; <<16 [it's a short]
	MOVEQ r0,r2			; precision = 0 ??!
	LDMEQFD sp!,{r4,r5,pc}
	EOR r2,r2,#1			; carry = 1-borrow
	MOVS r2,r2,LSR#1		; carry into C flag
ps1	LDR r4,[r0]
	LDR r5,[r1],#4
	SBCS r4,r4,r5
	STR r4,[r0],#4
	SUB r3,r3,#1<<16
	TEQ r3,#0
	BNE ps1
	ADC r0,r3,#0			; r3==0 here, so r0 = carry
	EOR r0,r0,#1			; borrow = 1-carry
	LDMFD sp!,{r4,r5,pc}

	EXPORT _p_rol
; _p_rol
; r0 points to a number, r1 is carry.
; rotate left 1 bit, return new carry
_p_rol	LDR r3,gp_
	LDR r3,[r3]
	MOVS r3,r3,LSL#16
	MOVEQ r0,r1
	MOVEQ pc,lr
	MOVS r1,r1,LSR#1		; carry into C flag
pr1	LDR r2,[r0]
	ADCS r2,r2,r2			; left shift 1b with carry -> b0
	STR r2,[r0],#4
	SUB r3,r3,#1<<16
	TEQ r3,#0
	BNE pr1
	ADC r0,r3,#0
	MOV pc,lr

	EXPORT _p_smul
; _p_smul
; r0:prod r1:multiplicand r2=multiplier
; accumulate into prod
_p_smul	STMFD sp!,{r5-r9,lr}
	LDR r3,gp_
	LDR r3,[r3]
	MOVS r3,r3,LSL#16
	LDMEQFD sp!,{r5-r9,pc}
	MOV r14,#0		; carried-over high word
	MOV r7,r2,LSR#16	; high1	in r7
pm1	LDR r5,[r1],#4		; get a digit
	; now a 32x32->64 multiply...
;NBNBNB factor out splitting of r2, since it doesn't change
	MOV r6,r5,LSR#16	; high0 in r6
	SUB r5,r5,r6,LSL#16	; low0	in r5
	SUB r8,r2,r7,LSL#16	; low1  in r8
	MUL r9,r5,r8		; ll in r9
	MUL r8,r6,r8		; hl in r8
	MUL r5,r7,r5		; lh in r5
	MUL r6,r7,r6		; hh in r6
	ADDS r8,r5,r8		; sum of cross terms in r8
	ADDCS r6,r6,#&10000	; bit 16 of hh
	ADDS r5,r9,r8,LSL#16	; ll + low(cross)
	ADC r8,r6,r8,LSR#16	; hh + high(cross) + carry
	; now low in r5, high in r8
	ADDS r5,r5,r14		; add in carry
	ADC r8,r8,#0
	LDR r14,[r0]		; digit from PROD
	ADDS r5,r5,r14		; add it in. There will be at most 1
	ADC r14,r8,#0		;  carry between prev add and this.
	STR r5,[r0],#4		; store a digit
	LDR r5,[r0]
	SUBS r3,r3,#1<<16
	BNE pm1
	ADD r5,r5,r14
	STR r5,[r0]		; add final carry back in
	LDMFD sp!,{r5-r9,pc}

; I really ought to implement mp_dmul in assembler too.
; It's pretty speed-critical.
; Actually if I do this my effort on mp_smul will have been
; wasted :-)
; Well, here goes...

	EXPORT _p_dmul
; _p_dmul
; r0 points to double-length space. r1,r2 point to bignums.
_p_dmul	STMFD sp!,{r4-r11,lr}
	; next few instructions are nasty hack so I can just drop in
	; my existing code...
	MOV r4,r0
	MOV r0,r2
	LDR r3,gp_
	LDR r3,[r3]
	MOV r3,r3,LSL#16
	MOV r3,r3,LSR#16
	MOV r2,r3
	; now for the code itself
mul9999	MOV r6,#0
	MOV r7,r4		; save start of answer
	ANDS r5,r3,#3
	BEQ m99l1		; multiple of 4 words
	CMP r5,#2
	STR r6,[r4],#4
	STRGE r6,[r4],#4
	STRGT r6,[r4],#4
m99l1	BICS r5,r3,#3
	BEQ m99l3
	MOV r8,#0
	MOV r9,#0
	MOV r10,#0
m99l2	STMIA r4!,{r6,r8,r9,r10}
	SUBS r5,r5,#4
	BGT m99l2
	; destination now has lots of 0s
	; it's worth setting out the register allocation here:
	; r0,r1 are digit pointers. r0 points to a[i], r1 to b[0]
	; r2,r3 are lengths. r2 decrements, r3 doesn't
	; r4 points to c[i]
	; r5 is the digit (of a) we're multiplying by
	; r6 is the carry
	; r7 is the inner loop counter ("j")
	; r8,9,10,11,12 are used by the 32x32 multiply
	; r13 is SP
	; r14 is the (unmodified) length of the first number
m99l3	;STMFD sp!,{r14}
	MOV r4,r7		; restore c
	MOV r14,r2		; save la
m99l4	LDR r5,[r0],#4		; f=a[i++]	START OF OUTER LOOP
	MOV r6,#0		; ca=0
	MOV r7,#0		; j=0
m99l5	LDR r8,[r1,r7,LSL#2]	; b[j]		START OF INNER LOOP
	; start of multiply routine -- mul3232 with all regs +8
	MOV r10,r8,LSR#16	; high halfwords in r10,r11
	MOV r11,r5,LSR#16
	SUB r8,r8,r10,LSL#16	; low halfwords in r8,r9
	SUB r9,r5,r11,LSL#16
	MUL r12,r8,r9		; ll in r12
	MUL r9,r10,r9		; hl in r9
	MUL r8,r11,r8		; lh in r8
	MUL r10,r11,r10		; hh in r10
	ADDS r9,r8,r9		; sum of cross terms in r9; now r8 spare
	ADDCS r10,r10,#&10000	; insert carry
	ADDS r8,r12,r9,LSL#16	; low bits in r8
	ADC r9,r10,r9,LSR#16	; high bits in r9
	; end of multiply routine. H in r9, L in r8
	LDR r10,[r4,r7,LSL#2]	; c[i+j]
	ADDS r8,r8,r10		; f*b[j] + c[i+j]
	ADC r9,r9,#0		; including high bits
	ADDS r8,r8,r6		; ... + ca
	ADC r6,r9,#0		; high bits are new carry
	STR r8,[r4,r7,LSL#2]	; low bits are new value for c[i+j]
	ADD r7,r7,#1		; ++j
	CMP r7,r3		; if (j<lb)
	BLT m99l5		; then continue with inner loop
	; END OF INNER LOOP
	STR r6,[r4,r7,LSL#2]	; c[i+j]=ca
	ADD r4,r4,#4		; advance &c[i]
	SUBS r2,r2,#1		; finished outer loop?
	BGT m99l4		; nope
	; END OF OUTER LOOP
	LDMFD sp!,{r4-r11,pc}

	END
