;some optimized functions (yes!)

	AREA |A$$code|,READONLY,CODE

	EXPORT imul31
	EXPORT imulxi
	EXPORT ifft_pass
	EXPORT |check_hardware|

;****************************************************************************

; imul31 - signed 32x32 -> 64-bit multiply, scaled down by 2^31.
;
; IN  r0  a, unit is anything
;     r1  b, unit is anything
; OUT r0  (a*b) >> 31, rounded: bit 30 of the 64-bit product is added
;         to the shifted result (low 32 bits of that value are returned)
;     r2  corrupted, flags corrupted (movs below sets them)
imul31
        smull   r2, r0, r1, r0          ; r0:r2 = a*b  (r0 = high word, r2 = low word)
                                        ; NOTE(review): b is given as Rm, so Rm stays
                                        ; distinct from RdLo/RdHi - older ARM cores
                                        ; require that; confirm before reordering.
        movs    r2, r2, lsr #31         ; r2 = bit 31 of the low word, C = bit 30
        adc     r0, r2, r0, lsl #1      ; r0 = (high << 1) + bit 31 + C (rounding)

        mov     pc, r14                 ; return to caller

; imulxi - signed 32x32 -> 64-bit multiply, scaled down by 2^xishift.
;          The shift is hard-coded in the instructions below as 24.
;
; IN  r0  a, unit is anything
;     r1  b, unit is anything
; OUT r0  (a*b) >> xishift (24), rounded: bit 23 of the 64-bit product
;         is added to the shifted result (low 32 bits are returned)
;     r2  corrupted, flags corrupted (movs below sets them)
imulxi
        smull   r2, r0, r1, r0          ; r0:r2 = a*b  (r0 = high word, r2 = low word)
        movs    r2, r2, lsr #24         ; r2 = top 8 bits of the low word, C = bit 23
        adc     r0, r2, r0, lsl #8      ; r0 = (high << 8) + r2 + C (rounding)

        mov     pc, r14                 ; return to caller

; ifft_pass - one in-place butterfly pass over four consecutive sub-blocks
;             of n complex values each: buf, buf+n, buf+2n, buf+3n.
;             Each complex value is two 32-bit words (real, imag), i.e.
;             8 bytes, which is why block offsets are formed with lsl #3.
;
;             Element 0 of each sub-block is combined without any weight
;             multiplication ("butterfly0"). The remaining elements are
;             combined in ifft_pass_loop, where a2/a3 are first multiplied
;             by the weight pair (wr, wi) = (weight[n], weight[2*i]); the
;             weight pointer advances by one word per iteration while i
;             counts down from n-1.
;
;             Weight products are computed as a 64-bit sum of two 32x32
;             products, then shifted right by 31 with rounding (same
;             movs/adc sequence as imul31).
;
;             NOTE: the loop body runs before its counter is tested, so it
;             executes at least once even for n <= 1 - presumably callers
;             always pass n >= 2; confirm against the caller.
;
; IN  r0  complex_t* buf
;     r1  int32_t*   weight
;     r2  int        n
; OUT -
;     r4-r12 and r14 are saved on entry and restored on exit;
;     r0, r1, r3 and the flags are modified.
ifft_pass
    stmdb   r13!,{r4-r12,r14} ; save working registers and return address
; butterfly0: first element of each sub-block, no weight multiply
	ldmia   r0,{r4-r5} ; a0.r,a0.i
    ; buf1 = buf + n;
	add     r3,r0,r2,lsl #3
    ldmia   r3,{r6-r7} ; a1.r,a1.i
    ; buf2 = buf1 + n;
	add     r3,r3,r2,lsl #3
    ldmia   r3,{r8-r9} ; a2.r,a2.i
    ; buf3 = buf2 + n;
	add     r3,r3,r2,lsl #3
    ldmia   r3,{r10-r11}; a3.r,a3.i
    ; tmp1-4: sums and differences of a2 and a3
    add     r8,r8,r10 ; r8  = a2.r + a3.r
    add     r9,r9,r11 ; r9  = a2.i + a3.i
	sub     r14,r8,r10,lsl #1 ; r14 = a2.r - a3.r  (sum minus 2*a3.r)
	sub     r11,r9,r11,lsl #1 ; r11 = a2.i - a3.i  (sum minus 2*a3.i)
	; a0-a3: new a0 and a1
    add     r4,r4,r8 ; a0.r += a2.r + a3.r
    add     r5,r5,r9 ; a0.i += a2.i + a3.i
    add     r6,r6,r11 ; a1.r += a2.i - a3.i
    sub     r7,r7,r14 ; a1.i -= a2.r - a3.r
	; buf: store a0 and advance buf by one complex value, so the loop
	; below starts at element 1 of every sub-block
	stmia   r0!,{r4-r5} ; a0.r,a0.i
	sub     r8,r4,r8,lsl #1 ; new a2.r = new a0.r - 2*(a2.r + a3.r)
	sub     r9,r5,r9,lsl #1 ; new a2.i = new a0.i - 2*(a2.i + a3.i)
	sub     r10,r6,r11,lsl #1 ; new a3.r = new a1.r - 2*(a2.i - a3.i)
	add     r11,r7,r14,lsl #1 ; new a3.i = new a1.i + 2*(a2.r - a3.r)
	; buf3 (r3 still points at element 0 of the fourth sub-block)
    stmia   r3,{r10-r11}; a3.r,a3.i
	; buf2 = buf3 - n
	sub     r3,r3,r2,lsl #3
    stmia   r3,{r8-r9} ; a2.r,a2.i
	; buf1 = buf2 - n
	sub     r3,r3,r2,lsl #3
    stmia   r3,{r6-r7} ; a1.r,a1.i
; i = n - 1 (loop counter, also used to index wi)
    sub     r12,r2,#1
; butterfly: remaining elements, a2/a3 are multiplied by (wr, wi) first
ifft_pass_loop
    ; buf2 = buf + 2*n;  (2*n complex values = n << 4 bytes)
	add     r3,r0,r2,lsl #4
    ldmia   r3,{r6-r7} ; a2.r,a2.i

	ldr     r4,[r1,r2,lsl #2] ;wr = weight[n]
	ldr     r5,[r1,r12,lsl #3] ;wi = weight[2*i]
	add     r1,r1,#4 ; weight++ (one 32-bit word)

	; products for a2; each is (64-bit sum) >> 31, rounded
    smull   r14,r11,r6,r4 ; r11:r14  = a2.r * wr
    smlal   r14,r11,r7,r5 ; r11:r14 += a2.i * wi
	movs    r14,r14,lsr #31 ; r14 = bit 31 of low word, C = bit 30
	adc     r11,r14,r11,lsl #1 ; r11 = (a2.r*wr + a2.i*wi) >> 31, rounded
	rsb     r5,r5,#0 ; r5 = -wi
    smull   r14,r9,r7,r4 ; r9:r14  = a2.i * wr
    smlal   r14,r9,r6,r5 ; r9:r14 -= a2.r * wi  (r5 holds -wi)
	movs    r14,r14,lsr #31
	adc     r9,r14,r9,lsl #1 ; r9 = (a2.i*wr - a2.r*wi) >> 31, rounded
    ; buf3 = buf2 + n;
	add     r3,r3,r2,lsl #3
    ldmia   r3,{r6-r7} ; a3.r,a3.i
	; products for a3 (r5 is still -wi for the first one)
    smull   r14,r8,r6,r4 ; r8:r14  = a3.r * wr
    smlal   r14,r8,r7,r5 ; r8:r14 -= a3.i * wi
	movs    r14,r14,lsr #31
	adc     r8,r14,r8,lsl #1 ; r8 = (a3.r*wr - a3.i*wi) >> 31, rounded
	rsb     r5,r5,#0 ; r5 = +wi again
    smull   r14,r10,r7,r4 ; r10:r14  = a3.i * wr
    smlal   r14,r10,r6,r5 ; r10:r14 += a3.r * wi
	movs    r14,r14,lsr #31
	adc     r10,r14,r10,lsl #1 ; r10 = (a3.i*wr + a3.r*wi) >> 31, rounded
	; buf
	ldmia   r0,{r4-r5} ; a0.r,a0.i (r4/r5 no longer hold wr/wi)
    ; buf1 = buf + n;
	add     r3,r0,r2,lsl #3
    ldmia   r3,{r6-r7} ; a1.r,a1.i

    add     r8,r8,r11         ; tmp1 = r8 + r11
	sub     r11,r8,r11,lsl #1 ; tmp4 = old r8 - old r11
    add     r9,r9,r10         ; tmp2 = r9 + r10
	sub     r10,r9,r10,lsl #1 ; tmp3 = old r9 - old r10
	; a0-a3
    add     r4,r4,r8 ; a0.r += tmp1
    add     r5,r5,r9 ; a0.i += tmp2
	; buf0: store a0 and advance buf to the next element
	stmia   r0!,{r4-r5} ; a0.r,a0.i

    add     r6,r6,r10 ; a1.r += tmp3
    add     r7,r7,r11 ; a1.i += tmp4
	; buf 1 (r3 was computed before buf was advanced)
    stmia   r3,{r6-r7} ; a1.r,a1.i

	sub     r8,r4,r8,lsl #1 ; new a2.r = new a0.r - 2*tmp1
	sub     r9,r5,r9,lsl #1 ; new a2.i = new a0.i - 2*tmp2
	; buf2 = buf1 + n
	add     r3,r3,r2,lsl #3
    stmia   r3,{r8-r9} ; a2.r,a2.i

	sub     r10,r6,r10,lsl #1 ; new a3.r = new a1.r - 2*tmp3
	sub     r11,r7,r11,lsl #1 ; new a3.i = new a1.i - 2*tmp4
	; buf3 = buf2 + n
	add     r3,r3,r2,lsl #3
    stmia   r3,{r10-r11}; a3.r,a3.i

; NOTE(review): this label is spelled "iff_" (not "ifft_") and nothing in
; this file branches to it.
iff_pass_loop_next
	subs    r12,r12,#1 ; --i
	bgt     ifft_pass_loop ; repeat while i > 0

	ldmia   r13!,{r4-r12,pc} ; restore registers and return

;==========================================================================================

; check_hardware - probe for the long multiply-accumulate instruction.
;
; Runs SMLAL on 251 * 241 with a zeroed 64-bit accumulator and compares
; the low word of the result with the same product computed by MUL.
;
; IN  -
; OUT r0  0 if the two results agree, otherwise pointer to Err_StrongARM
;     r1-r3 corrupted, flags corrupted
;
; Err_StrongARM is a word (0) followed by a zero-terminated message -
; presumably a RISC OS style error block; confirm against the caller.

Err_StrongARM
        DCD     0
        = "Processor with 64-bit multiplication instructions (StrongARM or later) is required.",0
        ALIGN
|check_hardware|
        mov     r3, #241                ; second factor
        mov     r2, #251                ; first factor
        mov     r1, #0                  ; accumulator, high word
        mov     r0, #0                  ; accumulator, low word
        smlal   r0, r1, r2, r3          ; r1:r0 += r2 * r3 (instruction under test)
        mul     r1, r2, r3              ; reference product (overwrites high word)
        cmp     r0, r1                  ; low word of SMLAL result == MUL result?
        moveq   r0, #0                  ; yes: report success
        adrne   r0, Err_StrongARM       ; no:  return the error block address
        mov     pc, r14                 ; return to caller

	END
