; Hand-optimised ARM routines: fixed-point multiplies, 16-bit PCM output
; interleaving and a check for long-multiply support.

        GET hdr.xint

	AREA |A$$code|,READONLY,CODE

	EXPORT MULT32
	EXPORT MULT31
	EXPORT |LMULBI|
	EXPORT |LMULSH|
	EXPORT |LMULSHR|
	EXPORT out_interlaced
	EXPORT |check_hardware|

;****************************************************************************

; MULT32 - high word of a signed 32x32 -> 64 bit multiply.
; IN   r0 = a (any unit)
;      r1 = b (any unit)
; OUT  r0 = (a*b) >> 32, no rounding applied
;      r2 = low word of the product (scratch)
; NOTE(review): Rm (r1) is kept distinct from the destination registers, as
;               older ARM cores are understood to require for long
;               multiplies - confirm before reordering the operands.
MULT32
	SMULL   r2,r0,r1,r0             ; r2 = low word, r0 = high word of a*b
	MOV     pc,r14                  ; return to caller

; MULT31 - signed 32x32 multiply scaled down by 31 bits.
; IN   r0 = a (any unit)
;      r1 = b (any unit)
; OUT  r0 = (a*b) >> 31, rounded using bit 30 of the product;
;           only the low 32 bits of the shifted value are kept
;      r2 = bit 31 of the low product word (scratch)
MULT31
	SMULL   r2,r0,r1,r0             ; r2 = low word, r0 = high word of a*b
	MOVS    r2,r2,LSR #31           ; r2 = bit 31 of low word, C = bit 30
	ADC     r0,r2,r0,LSL #1         ; (high << 1) + bit 31 + rounding carry
	MOV     pc,r14                  ; return to caller

; LMULBI - signed 32x32 multiply scaled down by the constant bishift.
; IN   r0 = a, in units of 1<<bishift
;      r1 = b (any unit)
; OUT  r0 = (a*b) >> bishift, rounded using the last bit shifted out;
;           only the low 32 bits of the shifted value are kept
;      r2 = low product word >> bishift (scratch)
; bishift is not defined in this file - presumably it comes from hdr.xint.
; NOTE(review): both immediate shifts assume 0 < bishift <= 32 - confirm.
|LMULBI|
	SMULL   r2,r0,r1,r0             ; r2 = low word, r0 = high word of a*b
	MOVS    r2,r2,LSR #bishift      ; low part of result, C = rounding bit
	ADC     r0,r2,r0,LSL #(32-bishift) ; merge in high part and round
	MOV     pc,r14                  ; return to caller

; LMULSH - signed 32x32 multiply followed by a variable, range-checked shift.
; IN   r0 = a
;      r1 = b
;      r2 = shift (any signed value)
; OUT  r0 = (a*b) >> (32+shift), where (l,h) is the 64-bit product:
;           shift > 31       : 0
;           0 < shift <= 31  : h >> shift, rounded with the last bit shifted out
;           shift == 0       : h, plus 1 if the top bit of l is set
;           -32 < shift < 0  : (l,h) >> (32+shift), rounded, low 32 bits kept
;           shift <= -32     : l << -(32+shift), h is ignored
;      r2, r3 corrupted
|LMULSH|

; shift > 31, return 0
	cmp     r2,#31
	movgt   r0,#0
	movgt   pc,r14

; (l,h) = (a*b), l in r3 and h in r0
	smull   r3,r0,r1,r0

	cmp     r2,#0
	blt     LMULSH_N
	beq     LMULSH_0

LMULSH_P
; shift > 0: return [(l,h) >> (32+shift)] which is [(h >> shift)]
	movs    r0,r0,asr r2            ; C = last bit shifted out of h
	adc     r0,r0,#0                ; round
	mov     pc,r14

LMULSH_0
; shift == 0: return [(l,h) >> 32] which is [h], rounded by the top bit of l
	cmp     r3,#0
	addlt   r0,r0,#1
	mov     pc,r14

LMULSH_N
; shift < 0: r2 = 32-abs(shift), the right shift to apply to (l,h)
	add     r2,r2,#32
	cmp     r2,#0
; A count of 0 (shift == -32) must also take the plain left-shift path:
; a register-specified lsr by 0 leaves C as set by the cmp above, which is
; always 1, so the adc below would return l+1 instead of l.
	ble     LMULSH_NN

; return [(l,h) >> r2] which is [(h << (32-r2)) + (l >> r2)], 0 < r2 < 32
	movs    r3,r3,lsr r2            ; C = last bit shifted out of l
	rsb     r2,r2,#32
	adc     r0,r3,r0,lsl r2         ; merge in h and round
	mov     pc,r14

LMULSH_NN
; shift <= -32: return [(l,h) << -r2] which is [l << -r2]
;               assuming there is no overflow possible (h = 0, l small)
	rsb     r2,r2,#0
	mov     r0,r3,lsl r2
	mov     pc,r14

; LMULSHR - signed 32x32 multiply followed by a variable shift.
; Same arithmetic as LMULSH but without any range checking of the shift.
; IN   r0 = a
;      r1 = b
;      r2 = shift, which must satisfy -32 < shift < 32
; OUT  r0 = (a*b) >> (32+shift), rounded
;      r2, r3 corrupted
|LMULSHR|

; r3 = low word (l), r0 = high word (h) of a*b
	smull   r3,r0,r1,r0

; dispatch on the sign of the shift
	cmp     r2,#0
	bgt     LMULSHR_right
	beq     LMULSHR_none

; shift < 0: the result straddles both product words.
; With n = 32+shift, return (h << (32-n)) + (l >> n).
	add     r2,r2,#32               ; r2 = n
	movs    r3,r3,lsr r2            ; C = last bit shifted out of l
	rsb     r2,r2,#32               ; r2 = 32-n
	adc     r0,r3,r0,lsl r2         ; merge in h and round
	mov     pc,r14

LMULSHR_none
; shift == 0: the result is h, rounded up when the top bit of l is set
	cmp     r3,#0
	addlt   r0,r0,#1
	mov     pc,r14

LMULSHR_right
; shift > 0: only h contributes, result is h >> shift
	movs    r0,r0,asr r2            ; C = last bit shifted out of h
	adc     r0,r0,#0                ; round
	mov     pc,r14

; out_interlaced - convert per-channel fixed-point sample buffers into
;                  interleaved 16-bit output.
; IN   r0 = xint** in    : array of pointers, one word-sized sample buffer
;                          per channel
;      r1 = short* out   : output buffer (forced down to a word boundary)
;      r2 = int samples  : samples per channel; nothing is done if <= 0
;      r3 = int channels : < 2 uses the mono path, 2 the stereo path,
;                          > 2 the generic path
; OUT  r0-r3 may be corrupted, r4-r12 are preserved
;
; Each sample is shifted right by (xishift-15) and clamped to
; -&8000..&7FFF. Only the stereo path rounds using the last bit shifted out.
; xishift is not defined in this file - presumably it comes from hdr.xint.
; NOTE(review): the immediate shift assumes xishift > 15 - confirm.
; NOTE(review): the stereo loop handles 4 samples per channel per pass and
;               the mono loop 2, so a count that is not a multiple of that
;               is rounded up and the buffers are accessed past 'samples'
;               - confirm callers allow for this.
out_interlaced
	stmfd	r13!,{r4-r12,r14}
; All loops below are test-at-end, so an empty request must leave here or it
; would still read and write one full pass.
	cmp     r2,#0
	ble     out_interlaced_end
	bic     r1,r1,#3 ; dirty but align stuff
	cmp     r3,#2
	blt     out_interlaced_1c
	bgt     out_interlaced_xc

; ---- exactly two channels: four stereo frames per pass ----
out_interlaced_2c
	mov     r3,#&10000
	sub     r3,r3,#1                ; r3 = &FFFF mask; r3 lsr 1 = &7FFF
	ldr     r14,[r0,#4]             ; r14 = second channel's buffer
	ldr     r0,[r0]                 ; r0  = first channel's buffer
out_interlaced_2cloop
	ldmia	r0!,{r4,r6,r8,r10}      ; four samples of the first channel
	ldmia	r14!,{r5,r7,r9,r11}     ; four samples of the second channel

; frame 0: first channel in the low halfword, second in the high halfword
	movs    r4,r4,asr #xishift-15   ; scale, C = last bit shifted out
	adc     r4,r4,#0                ; round
	subs    r12,r4,#&8000           ; r12 is only a scratch for the flags
	movge   r4,r3,lsr#1             ; >= &8000: clamp to &7FFF
	adds    r12,r4,#&8000
	movlt   r4,#&8000               ; < -&8000: clamp to &8000 (16-bit minimum)
	and     r4,r4,r3                ; keep the low 16 bits only
	movs    r5,r5,asr #xishift-15
	adc     r5,r5,#0
	subs    r12,r5,#&8000
	movge   r5,r3,lsr#1
	adds    r12,r5,#&8000
	movlt   r5,#&8000
	orr     r4,r4,r5,lsl#16         ; the shift discards r5's upper bits

; frame 1
	movs    r6,r6,asr #xishift-15
	adc     r6,r6,#0
	subs    r12,r6,#&8000
	movge   r6,r3,lsr#1
	adds    r12,r6,#&8000
	movlt   r6,#&8000
	and     r6,r6,r3
	movs    r7,r7,asr #xishift-15
	adc     r7,r7,#0
	subs    r12,r7,#&8000
	movge   r7,r3,lsr#1
	adds    r12,r7,#&8000
	movlt   r7,#&8000
	orr     r6,r6,r7,lsl#16

; frame 2
	movs    r8,r8,asr #xishift-15
	adc     r8,r8,#0
	subs    r12,r8,#&8000
	movge   r8,r3,lsr#1
	adds    r12,r8,#&8000
	movlt   r8,#&8000
	and     r8,r8,r3
	movs    r9,r9,asr #xishift-15
	adc     r9,r9,#0
	subs    r12,r9,#&8000
	movge   r9,r3,lsr#1
	adds    r12,r9,#&8000
	movlt   r9,#&8000
	orr     r8,r8,r9,lsl#16

; frame 3
	movs    r10,r10,asr #xishift-15
	adc     r10,r10,#0
	subs    r12,r10,#&8000
	movge   r10,r3,lsr#1
	adds    r12,r10,#&8000
	movlt   r10,#&8000
	and     r10,r10,r3
	movs    r11,r11,asr #xishift-15
	adc     r11,r11,#0
	subs    r12,r11,#&8000
	movge   r11,r3,lsr#1
	adds    r12,r11,#&8000
	movlt   r11,#&8000
	orr     r10,r10,r11,lsl#16

	stmia	r1!,{r4,r6,r8,r10}      ; write the four packed frames
	subs    r2,r2,#4
	bgt     out_interlaced_2cloop
	b	out_interlaced_end

; ---- fewer than two channels: two samples packed into each output word ----
; NOTE(review): unlike the stereo path, no rounding is applied here.
out_interlaced_1c
	mov     r4,#&10000
	sub     r4,r4,#1                ; r4 = &FFFF mask; r4 lsr 1 = &7FFF
	ldr     r0,[r0]                 ; r0 = the only channel's buffer
out_interlaced_1cloop
	ldmia   r0!,{r5,r6}
	mov     r5,r5,asr #xishift-15
	subs    r11,r5,#&8000           ; r11 is only a scratch for the flags
	movge   r5,r4,lsr#1             ; clamp high
	adds    r11,r5,#&8000
	movlt   r5,#&8000               ; clamp low
	and     r5,r5,r4                ; first sample -> low halfword

	mov     r6,r6,asr #xishift-15
	subs    r11,r6,#&8000
	movge   r6,r4,lsr#1
	adds    r11,r6,#&8000
	movlt   r6,#&8000

	orr     r5,r5,r6,lsl#16         ; second sample -> high halfword
	str     r5,[r1],#4
	subs    r2,r2,#2
	bgt     out_interlaced_1cloop
	b	out_interlaced_end

; ---- more than two channels: one channel at a time, byte stores ----
; r6 = channel index, r7 = sample index, r5 = current channel's buffer.
; NOTE(review): unlike the stereo path, no rounding is applied here.
out_interlaced_xc
	mov     r4,#&10000
	sub     r4,r4,#1                ; r4 lsr 1 = &7FFF
	mov     r6,#0
out_interlaced_cloop
	ldr     r5,[r0,r6,lsl #2]
	mov     r7,#0
out_interlaced_sloop
	ldr     r8,[r5,r7,lsl #2]
	mov     r8,r8,asr #xishift-15
	subs    r11,r8,#&8000
	movge   r8,r4,lsr#1             ; clamp high
	adds    r11,r8,#&8000
	movlt   r8,#&8000               ; clamp low

	strb    r8,[r1,#0]              ; low byte first
	mov     r8,r8,lsr #8
	strb    r8,[r1,#1]              ; then the high byte
	add     r1,r1,r3,lsl #1         ; step one frame (channels*2 bytes)

	add     r7,r7,#1
	cmp     r7,r2
	blt     out_interlaced_sloop

	mul     r7,r3,r2                ; frames written * channels
	sub     r1,r1,r7,lsl #1         ; rewind to the start of the output
	add     r1,r1,#2                ; and move on to the next channel's slot

	add     r6,r6,#1
	cmp     r6,r3
	blt     out_interlaced_cloop
; fall through to the shared exit

out_interlaced_end
; restore registers and return; the non-32-bit build uses the ^ form
	[ {CONFIG}=32
	ldmfd	r13!,{r4-r12,pc}
	|
	ldmfd	r13!,{r4-r12,pc}^
	]

;==========================================================================================

; check_hardware - verify that the long multiply-accumulate (SMLAL) works.
; IN   nothing
; OUT  r0 = 0 if SMLAL gave the same low word as MUL,
;           otherwise a pointer to the Err_StrongARM error block
;      r1-r3 corrupted

; Error block: one zero word followed by a zero-terminated message.
Err_StrongARM
	DCD     0
	DCB     "Processor with 64-bit multiplication instructions (StrongARM or later) is required.",0
	ALIGN

|check_hardware|
; operands for the probe
	MOV     R2,#251
	MOV     R3,#241
; clear the 64-bit accumulator R1:R0, then accumulate 251*241 into it
	MOV     R0,#0
	MOV     R1,#0
	SMLAL   R0,R1,R2,R3
; reference result from the ordinary 32-bit multiply
	MUL     R1,R2,R3
; the low word of the long result must match the reference
	CMP     R0,R1
	MOVEQ   R0,#0
	ADRNE   R0,Err_StrongARM
	MOV     PC,R14

	END
