;some optimized functions (yes!)

	AREA |A$$code|, CODE, READONLY

	EXPORT |DEQUANT|
	EXPORT |UMULSH|
	EXPORT |LMUL|
	EXPORT |EXPn3|
	EXPORT |subbandsynthesis_2|
	EXPORT |subbandsynthesis_3|
	EXPORT |dct36|
	EXPORT |antialias_1|
	EXPORT |check_hardware|

xunit * 23

;==========================================================================================

	MACRO
	LMULX   $dest,$m1,$m2,$shift ;$dest!=$m1!=$m2, $m1 clobbered
	; Signed fixed-point multiply with rounding:
	;   $dest = low 32 bits of (($m1 * $m2) >> $shift), plus the last bit shifted out.
	; $shift is an assembly-time constant; it is used as the immediate shift
	; amounts #$shift and #32-$shift below.
	; $m1 receives the low product word and is then shifted, so its input value
	; is lost. $m2 is only read. The flags are changed by the MOVS.

	smull   $m1,$dest,$m2,$m1               ; $dest:$m1 = $m2 * $m1 (64-bit signed)
	movs    $m1,$m1,lsr #$shift             ; low part of the result, C = last bit shifted out
	adc     $dest,$m1,$dest,lsl #32-$shift  ; merge in the high part and round with C

	MEND

;==========================================================================================

expx_3
	DCD     0              ; n = 0: multiplier is 0, so the result is always 0
	DCD     (&285145F0>>4) ; 2^(1/3) unit is 2^25
	DCD     (&32CBFD40>>4) ; 2^(2/3) unit is 2^25
	DCD     (&40000000>>4) ; 2^(3/3) unit is 2^25

; EXPn3: scale a fixed-point value by expx_3[n] and shift it right, rounding on
; the last bit shifted out.
;
; parameters:
;  r0 = a       inout  a, = a*expx_3(n)>>shift  fixed, unit is 2^30 in and 2^xunit out
;  r1 = scale   in     (shift << 2) + n         encoded
;
; The result is the high word of the 64-bit product a * expx_3[n], arithmetically
; shifted right by 'shift'. r1, r2, r3 and the flags are clobbered.
;
; A register-specified shift of 0 leaves the carry flag untouched, so rounding
; with ADDCS after it would use whatever carry the caller happened to have.
; shift == 0 is therefore returned early, unrounded (no bits are shifted out).

|EXPn3|
	adr     r3,expx_3
	and     r2,r1,#3            ; r2 = n (low two bits of scale)
	ldr     r3,[r3,r2,lsl #2]   ; r3 = expx_3[n]
	smull   r2,r0,r3,r0         ; r0 = high word of a * expx_3[n]
	movs    r1,r1,lsr #2        ; r1 = shift, Z set when shift == 0
;	sub     r1,r1,# (xunit + 32 - 25 - 30)  ; disabled; if re-enabled use SUBS so Z tracks the final shift
	moveq   pc,r14              ; shift == 0: nothing to shift or round
	movs    r0,r0,asr r1        ; C = last bit shifted out
	addcs   r0,r0,#1            ; round

	mov     pc,r14

;==========================================================================================

; LMUL: signed 32x32 -> 64-bit multiply, returning the low 32 bits of
; (a*b) >> shift, rounded on the last bit shifted out.
;
; parameters:
;  r0 = a       inout  a, = (a*b) >> shift      fixed
;  r1 = b       in     b                        fixed
;  r2 = shift   in     shift                    unsigned integer
;
; r2, r3 and the flags are clobbered.
;
; shift == 0 is handled separately: a register-specified shift of 0 leaves the
; carry flag untouched, so the ADC below would otherwise add the caller's stale
; carry to the result.

|LMUL|

	smull   r3,r0,r1,r0         ; r0:r3 = a * b (64-bit signed)
	cmp     r2,#0
	moveq   r0,r3               ; shift == 0: result is the low product word
	moveq   pc,r14
	movs    r3,r3,lsr r2        ; low part of the result, C = last bit shifted out
	rsb     r2,r2,#32
	adc     r0,r3,r0,lsl r2     ; merge in the high part and round with C

	mov     pc,r14

;==========================================================================================

pow2tab
	DCD &40000000 ; 2^(-0/4) unit is 2^30
	DCD &35d13f32 ; 2^(-1/4) unit is 2^30
	DCD &2d413ccd ; 2^(-2/4) unit is 2^30
	DCD &260dfc14 ; 2^(-3/4) unit is 2^30

; DEQUANT: replace each word of xr, in place, by a scaled table value.
;
; For every element x:
;   entry    = pow43[|x|]                    (word table indexed by |x|)
;   exp      = entry >> 27, mantissa = entry & &07FFFFFF
;   shift    = (xyz >> 2) - (xunit + 32 - 30) + exp
;   product  = pow2tab[xyz & 3] * mantissa   (64-bit unsigned)
;   result   = 0                             if shift > 31
;            = product >> (32 + shift)       otherwise; when shift < 0 the
;                                            last bit shifted out is rounded in
;   x        = result with the sign of the original x
;
; parameters:
;  r0 = xr      inout  ptr to bands array       fixed, 2^xunit on exit
;  r1 = pow43   in     ptr to pow43tab array    encoded
;                      (exp <<27) + mantissa : val = mantissa * 2^-exp, unit is 1
;  r2 = xyz     in     value for 2^xyz          (signed?) integer
;  r3 = counter in     amount of bands          integer
;
; The loop tests the counter at the bottom, so one element is always processed.
; NOTE(review): xyz is shifted with LSR, not ASR, so a negative xyz would give a
; huge shift - confirm xyz is never negative.

|DEQUANT|
	STMFD   R13!,{R4-R8,R14}
	MOV     R4,#(xunit + 32 - 30)
	RSB     R4,R4,R2,LSR #2         ; R4 = (xyz >> 2) - (xunit + 32 - 30)
	AND     R2,R2,#3
	ADR     R5,pow2tab
	LDR     R2,[R5,R2,LSL #2]       ; R2 = pow2tab[xyz & 3]

dequant_loop
	LDR     R5,[R0]
	MOVS    R7,R5                   ; R7 keeps the signed input for the sign fix-up
	RSBLT   R5,R5,#0                ; R5 = |x|
;	CMP     R5,#1040
;	MOVGE   R5,R5,LSR #3
;	SUBGE   R8,R4,#4
;	MOVLT   R8,R4
	MOV     R8,R4

	LDR     R5,[R1,R5,LSL #2]       ; R5 = pow43[|x|]
	ADD     R8,R8,R5,LSR #27        ; R8 = total shift = base + exp

	CMP     R8,#31
	MOVGT   R5,#0                   ; shifted out completely
	BGT     dequant_loop_store

	BIC     R5,R5,#0xF8000000       ; keep the 27-bit mantissa
	UMULL   R6,R5,R2,R5             ; R5:R6 = pow2tab value * mantissa

	CMP     R8,#0
	MOVGT   R5,R5,LSR R8            ; shift > 0: high word >> shift
	BGE     dequant_loop_store      ; shift = 0: high word as is

	; shift < 0: result = product >> (32 + shift), rounded.
	; The low word must be shifted by the variable amount in R8 (= 32 + shift);
	; the previous code used the constant #8 here, which mixed unrelated low
	; bits into the result.
	ADD     R8,R8,#32               ; R8 = 32 + shift = 32 - s
	MOVS    R6,R6,LSR R8            ; low part, C = last bit shifted out
	RSB     R8,R8,#32               ; R8 = s = -shift
	ADC     R5,R6,R5,LSL R8         ; merge in the high part and round with C

dequant_loop_store
	CMP     R7,#0
	RSBLT   R5,R5,#0                ; restore the sign of the input
	STR     R5,[R0],#4
	SUBS    R3,R3,#1
	BGT     dequant_loop
	LDMFD   R13!,{R4-R8,PC}

; UMULSH: unsigned 32x32 -> 64-bit multiply followed by a signed shift.
;
; parameters:
;  r0 = a       inout  a, = (a*b) >> (32+shift) fixed
;  r1 = b       in     b                        fixed
;  r2 = shift   in     shift                    signed integer !!
;
;  shift > 31 : result is 0
;  shift > 0  : high product word shifted right by shift
;  shift = 0  : high product word
;  shift < 0  : (high << -shift) | (low >> (32 + shift))
; No rounding is applied. r2, r3 and the flags are clobbered.

|UMULSH|

	cmp     r2,#31
	bgt     umulsh_zero             ; everything is shifted out

	umull   r3,r0,r1,r0             ; r0:r3 = a * b (64-bit unsigned)

	cmp     r2,#0
	blt     umulsh_left
	movgt   r0,r0,lsr r2            ; shift > 0; shift = 0 keeps the high word
	mov     pc,r14

umulsh_left
	rsb     r2,r2,#0                ; r2 = -shift
	mov     r0,r0,lsl r2
	rsb     r2,r2,#32               ; r2 = 32 + shift
	orr     r0,r0,r3,lsr r2         ; bring in the top bits of the low word
	mov     pc,r14

umulsh_zero
	mov     r0,#0

	mov     pc,r14

;==========================================================================================

; Constant table: label cosX_N holds 1/(2*cos(PI*X/N))
; 1 is 1<<27
; The entries are contiguous words, so runs of them can be fetched with one
; LDMIA starting at the first label of a group (cos1_64.., cos1_32.., cos1_16..).
cos1_64
	DCD &04013c25
cos3_64
	DCD &040b345b
cos5_64
	DCD &041fa2d6
cos7_64
	DCD &043f9342
cos9_64
	DCD &046cc1bc
cos11_64
	DCD &04a9d9cf
cos13_64
	DCD &04fae371
cos15_64
	DCD &056601ea
cos17_64
	DCD &05f4cf6e
cos19_64
	DCD &06b6fcf2
cos21_64
	DCD &07c7d1d8
cos23_64
	DCD &095b0352
cos25_64
	DCD &0bdf91b2
cos27_64
	DCD &107655e4
cos29_64
	DCD &1b42c834
cos31_64
	DCD &518522fa
cos1_32
	DCD &0404f467
cos3_32
	DCD &042e13c1
cos5_32
	DCD &048919f4
cos7_32
	DCD &052cb0e6
cos9_32
	DCD &064e2402
cos11_32
	DCD &087c4495
cos13_32
	DCD &0dc79258
cos15_32
	DCD &28cf2701
cos1_16
	DCD &04140fb4
cos3_16
	DCD &04cf8de8
cos5_16
	DCD &073326bb
cos7_16
	DCD &1480d9d0
cos1_8
	DCD &04545e9e
cos3_8
	DCD &0a73d748
cos1_4
	DCD &05a82799

; parameters:
;  r0 = bptr1   out    ptr to hanbuf1 array     fixed
;  r1 = bptr2   out    ptr to hanbuf2 array     fixed
;  r2 = bandp   in     ptr to bands array       fixed
;
; Reads the 32 words at bandp and writes results into the two output buffers
; at a stride of 64 bytes per entry (the #k*64 store offsets below).
; The work is done in two passes over the input:
;   pass 1 starts from the sums        p[i] = b[i] + b[31-i]
;   pass 2 starts from the differences p[i] = (b[i] - b[31-i]) * cos_64[i]
; Each pass runs the same chain of add/subtract-and-scale stages
; (cos*_32, cos*_16, cos*_8, cos1_4) before the stores.
;
; Multiply idiom used throughout: "mov x,x,asl#5" followed by
; "smull lo,x,c,x" keeps only the high product word, i.e. (x<<5)*c >> 32.
; With the table constants scaled so that 1 is 1<<27, this is x*c.
;
; Stack frame after the prologue (word offsets from r13):
;   [0..15]  scratch buffer for intermediate values
;   [16]     saved r0 = bptr1
;   [17]     saved r1 = bptr2
;   [18]     saved r2 = bandp
; r4-r12 and r14 are saved on entry and restored on exit.

|subbandsynthesis_2|

	stmfd   r13!,{r4-r12,r14}

	stmfd   r13!,{r0-r2}
	sub     r13,r13,#16*4 ; buffer of intermediate values

	mov     r8,r13
	mov     r12,r2
	add     r14,r2,#32*4

	; p[i] = b[i] + b[31-i]
	; i = 0 to 3
	ldmia   r12!,{r0-r3}
	ldmdb   r14!,{r4-r7}
	add     r0,r0,r7
	add     r1,r1,r6
	add     r2,r2,r5
	add     r3,r3,r4
	stmia   r8!,{r0-r3}

	; i = 4 to 7
	ldmia   r12!,{r0-r3}
	ldmdb   r14!,{r4-r7}
	add     r0,r0,r7
	add     r1,r1,r6
	add     r2,r2,r5
	add     r3,r3,r4
	stmia   r8!,{r0-r3}

	; i = 8 to 11
	ldmia   r12!,{r0-r3}
	ldmdb   r14!,{r4-r7}
	add     r0,r0,r7
	add     r1,r1,r6
	add     r2,r2,r5
	add     r3,r3,r4

	; i = 12 to 15
	ldmia   r12!,{r4-r7}
	ldmdb   r14!,{r8-r11}
	add     r4,r4,r11
	add     r5,r5,r10
	add     r6,r6,r9
	add     r7,r7,r8

	; r0-r7 is p8-p15    r13[0-7] is p0-p7

;---------------------------------------------------

	ldmia   r13,{r9-r12}    ; p0-p3

	ldr     r14,cos1_32
	sub     r8,r9,r7
	mov     r8,r8,asl#5     ;pp8 = (p0 - p15) * cos1_32
	add     r7,r9,r7        ;pp0 = (p0 + p15)
	smull   r9,r8,r14,r8

	ldr     r14,cos3_32
	sub     r9,r10,r6
	mov     r9,r9,asl#5     ;pp9 = (p1 - p14) * cos3_32
	add     r6,r10,r6       ;pp1 = (p1 + p14)
	smull   r10,r9,r14,r9

	ldr     r14,cos5_32
	sub     r10,r11,r5
	mov     r10,r10,asl#5   ;pp10 = (p2 - p13) * cos5_32
	add     r5,r11,r5       ;pp2  = (p2 + p13)
	smull   r11,r10,r14,r10

	ldr     r14,cos7_32
	sub     r11,r12,r4
	mov     r11,r11,asl#5   ;pp11 = (p3 - p12) * cos7_32
	add     r4,r12,r4       ;pp3  = (p3 + p12)
	smull   r12,r11,r14,r11

	add     r12,r13,#8*4
	stmia   r12,{r8-r11}

	ldmdb   r12,{r9-r12}

	ldr     r14,cos9_32
	sub     r8,r9,r3
	mov     r8,r8,asl#5     ;pp12 = (p4 - p11) * cos9_32
	add     r3,r9,r3        ;pp4  = (p4 + p11)
	smull   r9,r8,r14,r8

	ldr     r14,cos11_32
	sub     r9,r10,r2
	mov     r9,r9,asl#5     ;pp13 = (p5 - p10) * cos11_32
	add     r2,r10,r2       ;pp5  = (p5 + p10)
	smull   r10,r9,r14,r9

	ldr     r14,cos13_32
	sub     r10,r11,r1
	mov     r10,r10,asl#5   ;pp14 = (p6 - p9) * cos13_32
	add     r1,r11,r1       ;pp6  = (p6 + p9)
	smull   r11,r10,r14,r10

	ldr     r14,cos15_32
	sub     r11,r12,r0
	mov     r11,r11,asl#5   ;pp15 = (p7 - p8) * cos15_32
	add     r0,r12,r0       ;pp7  = (p7 + p8)
	smull   r12,r11,r14,r11

	add     r12,r13,#12*4
	stmia   r12,{r8-r11}

	; r0-r7 is pp7-pp0  |  r13[8-15] is pp8-pp15

;---------------------------------------------------

	sub     r8,r7,r0
	mov     r8,r8,asl#5 ;p4 = (pp0 - pp7) * cos1_16
	add     r0,r7,r0    ;p0 = (pp0 + pp7)

	sub     r9,r6,r1
	mov     r9,r9,asl#5 ;p5 = (pp1 - pp6) * cos3_16
	add     r1,r6,r1    ;p1 = (pp1 + pp6)

	sub     r6,r5,r2
	mov     r6,r6,asl#5 ;p6 = (pp2 - pp5) * cos5_16
	add     r2,r5,r2    ;p2 = (pp2 + pp5)

	sub     r7,r4,r3
	mov     r7,r7,asl#5 ;p7 = (pp3 - pp4) * cos7_16
	add     r3,r4,r3    ;p3 = (pp3 + pp4)

	mov     r4,r8
	mov     r5,r9

	adr     r14,cos1_16
	ldmia   r14,{r8-r11}    ; cos1_16, cos3_16, cos5_16, cos7_16
	smull   r14,r4,r8,r4
	smull   r14,r5,r9,r5
	smull   r14,r6,r10,r6
	smull   r14,r7,r11,r7

	; r0-r7 is p0-p7
;==
	ldr     r14,cos3_8

	sub     r8,r1,r2
	mov     r8,r8,asl#5 ;pp3 = (p1 - p2) * cos3_8
	add     r1,r1,r2    ;pp1 = (p1 + p2)
	smull   r12,r8,r14,r8

	sub     r9,r5,r6
	mov     r9,r9,asl#5 ;pp7 = (p5 - p6) * cos3_8
	add     r5,r5,r6    ;pp5 = (p5 + p6)
	smull   r12,r9,r14,r9

	ldr     r14,cos1_8

	sub     r2,r0,r3
	mov     r2,r2,asl#5 ;pp2 = (p0 - p3) * cos1_8
	add     r0,r0,r3    ;pp0 = (p0 + p3)
	smull   r12,r2,r14,r2

	sub     r6,r4,r7
	mov     r6,r6,asl#5 ;pp6 = (p4 - p7) * cos1_8
	add     r4,r4,r7    ;pp4 = (p4 + p7)
	smull   r12,r6,r14,r6

	mov     r3,r8
	mov     r7,r9

	; r0-r7 is pp0-pp7
;==
	ldr     r14,cos1_4

	sub     r8,r0,r1
	mov     r8,r8,asl#5 ;p1 = (pp0 - pp1) * cos1_4
	add     r0,r0,r1    ;p0 = (pp0 + pp1)
	smull   r12,r1,r14,r8

	sub     r8,r2,r3
	mov     r8,r8,asl#5 ;p3 = (pp2 - pp3) * cos1_4
	add     r2,r2,r3    ;p2 = (pp2 + pp3)
	smull   r12,r3,r14,r8

	sub     r8,r4,r5
	mov     r8,r8,asl#5 ;p5 = (pp4 - pp5) * cos1_4
	add     r4,r4,r5    ;p4 = (pp4 + pp5)
	smull   r12,r5,r14,r8

	sub     r8,r6,r7
	mov     r8,r8,asl#5 ;p7 = (pp6 - pp7) * cos1_4
	add     r6,r6,r7    ;p6 = (pp6 + pp7)
	smull   r12,r7,r14,r8

	; r0-r7 is p0-p7
;==---------------------------
	ldr     r8,[r13,#16*4]  ; reload buffer pointers
	ldr     r9,[r13,#17*4]

	add     r14,r6,r7       ;tmp = p6 + p7
	rsb     r14,r14,#0
	sub     r11,r14,r5
	str     r11,[r9,#4*64]  ;buf[36] = -(p5 + tmp)
	sub     r11,r14,r4
	str     r11,[r9,#12*64] ;buf[44] = -(p4 + tmp)
	add     r11,r5,r7
	str     r11,[r8,#4*64]  ;buf[4] = p5 + p7
	rsb     r11,r0,#0
	str     r11,[r9,#16*64] ;buf[48] = -p0
	str     r1,[r8,#0*64]   ;buf[0] = p1
	rsb     r1,r1,#0
	str     r1,[r9,#0*64]   ;buf[32] = - buf[0]
	str     r3,[r8,#8*64]   ;buf[8] = p3
	str     r7,[r8,#12*64]  ;buf[12] = p7
	add     r11,r2,r3
	rsb     r11,r11,#0
	str     r11,[r9,#8*64]  ;buf[40] = -(p2  + p3)

;---------------------------------------------------

	add     r12,r13,#8*4
	ldmia   r12,{r0-r3,r8-r11}  ;pp8-pp15

	sub     r4,r0,r11
	mov     r4,r4,asl#5 ;p12 = (pp8 - pp15) * cos1_16
	add     r0,r0,r11   ;p8  = (pp8 + pp15)

	sub     r5,r1,r10
	mov     r5,r5,asl#5 ;p13 = (pp9 - pp14) * cos3_16
	add     r1,r1,r10   ;p9  = (pp9 + pp14)

	sub     r6,r2,r9
	mov     r6,r6,asl#5 ;p14 = (pp10 - pp13) * cos5_16
	add     r2,r2,r9    ;p10 = (pp10 + pp13)

	sub     r7,r3,r8
	mov     r7,r7,asl#5 ;p15 = (pp11 - pp12) * cos7_16
	add     r3,r3,r8    ;p11 = (pp11 + pp12)

	adr     r14,cos1_16
	ldmia   r14,{r8-r11}    ; cos1_16, cos3_16, cos5_16, cos7_16
	smull   r14,r4,r8,r4
	smull   r14,r5,r9,r5
	smull   r14,r6,r10,r6
	smull   r14,r7,r11,r7

	; r0-r7 is p8-p15
;==
	ldr     r14,cos1_8

	sub     r8,r0,r3
	mov     r8,r8,asl#5 ;pp10 = (p8 - p11) * cos1_8
	add     r0,r0,r3    ;pp8  = (p8 + p11)
	smull   r12,r8,r14,r8

	sub     r9,r4,r7
	mov     r9,r9,asl#5 ;pp14 = (p12 - p15) * cos1_8
	add     r4,r4,r7    ;pp12 = (p12 + p15)
	smull   r12,r9,r14,r9

	ldr     r14,cos3_8

	sub     r3,r1,r2
	mov     r3,r3,asl#5 ;pp11 = (p9 - p10) * cos3_8
	add     r1,r1,r2    ;pp9  = (p9 + p10)
	smull   r12,r3,r14,r3

	sub     r7,r5,r6
	mov     r7,r7,asl#5 ;pp15 = (p13 - p14) * cos3_8
	add     r5,r5,r6    ;pp13 = (p13 + p14)
	smull   r12,r7,r14,r7

	mov     r2,r8
	mov     r6,r9

	; r0-r7 is pp8-pp15
;==
	ldr     r14,cos1_4

	sub     r8,r0,r1
	mov     r8,r8,asl#5 ;p9 = (pp8 - pp9) * cos1_4
	add     r0,r0,r1    ;p8 = (pp8 + pp9)
	smull   r12,r1,r14,r8

	sub     r8,r2,r3
	mov     r8,r8,asl#5 ;p11 = (pp10 - pp11) * cos1_4
	add     r2,r2,r3    ;p10 = (pp10 + pp11)
	smull   r12,r3,r14,r8

	sub     r8,r4,r5
	mov     r8,r8,asl#5 ;p13 = (pp12 - pp13) * cos1_4
	add     r4,r4,r5    ;p12 = (pp12 + pp13)
	smull   r12,r5,r14,r8

	sub     r8,r6,r7
	mov     r8,r8,asl#5 ;p15 = (pp14 - pp15) * cos1_4
	add     r6,r6,r7    ;p14 = (pp14 + pp15)
	smull   r12,r7,r14,r8

	; r0-r7 is p8-p15
;==----------------------------
	ldr     r8,[r13,#16*4]  ; reload buffer pointers
	ldr     r9,[r13,#17*4]

	add     r14,r3,r7       ;tmp = p11 + p15
	str     r14,[r8,#10*64] ;buf[10] = tmp
	add     r11,r14,r5
	str     r11,[r8,#6*64]  ;buf[6] = p13 + tmp
	add     r14,r6,r7       ;tmp = p14 + p15
	rsb     r14,r14,#0
	sub     r11,r14,r0
	sub     r11,r11,r4
	str     r11,[r9,#14*64] ;buf[46] = -(p8  + p12 + tmp)
	sub     r11,r14,r1
	sub     r11,r11,r5
	str     r11,[r9,#2*64]  ;buf[34] = -(p9  + p13 + tmp)
	sub     r14,r14,r2
	sub     r14,r14,r3      ;tmp += p10 + p11
	sub     r11,r14,r5
	str     r11,[r9,#6*64]  ;buf[38] = -(p13 + tmp)
	sub     r11,r14,r4
	str     r11,[r9,#10*64] ;buf[42] = -(p12 + tmp)
	add     r11,r1,r5
	add     r11,r11,r7
	str     r11,[r8,#2*64]  ;buf[2] = p9 + p13 + p15
	str     r7,[r8,#14*64]  ;buf[14] = p15

;++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
; Second pass: same stage chain, starting from the scaled differences.

	ldr     r2,[r13,#18*4] ; bandp

	mov     r8,r13
	ADRL    r9,cos1_64
	mov     r12,r2
	add     r14,r2,#32*4

	; p[i] = (b[i] - b[31-i]) * cos_64[i]
	; i = 0 to 3
	ldmia   r12!,{r0-r3}
	ldmdb   r14!,{r4-r7}
	sub     r0,r0,r7
	sub     r1,r1,r6
	sub     r2,r2,r5
	sub     r3,r3,r4
	mov     r0,r0,asl #5
	mov     r1,r1,asl #5
	mov     r2,r2,asl #5
	mov     r3,r3,asl #5
	ldmia   r9!,{r4-r7}
	smull   r10,r0,r4,r0
	smull   r10,r1,r5,r1
	smull   r10,r2,r6,r2
	smull   r10,r3,r7,r3
	stmia   r8!,{r0-r3}

	; i = 4 to 7
	ldmia   r12!,{r0-r3}
	ldmdb   r14!,{r4-r7}
	sub     r0,r0,r7
	sub     r1,r1,r6
	sub     r2,r2,r5
	sub     r3,r3,r4
	mov     r0,r0,asl #5
	mov     r1,r1,asl #5
	mov     r2,r2,asl #5
	mov     r3,r3,asl #5
	ldmia   r9!,{r4-r7}
	smull   r10,r0,r4,r0
	smull   r10,r1,r5,r1
	smull   r10,r2,r6,r2
	smull   r10,r3,r7,r3
	stmia   r8!,{r0-r3}

	; i = 8 to 11
	ldmia   r12!,{r0-r3}
	ldmdb   r14!,{r4-r7}
	sub     r0,r0,r7
	sub     r1,r1,r6
	sub     r2,r2,r5
	sub     r3,r3,r4
	mov     r0,r0,asl #5
	mov     r1,r1,asl #5
	mov     r2,r2,asl #5
	mov     r3,r3,asl #5
	ldmia   r9!,{r4-r7}
	smull   r10,r0,r4,r0
	smull   r10,r1,r5,r1
	smull   r10,r2,r6,r2
	smull   r10,r3,r7,r3

	; i = 12 to 15
	ldmia   r12!,{r4-r7}
	ldmdb   r14!,{r8-r11}
	sub     r4,r4,r11
	mov     r4,r4,asl#5     ;p12
	ldr     r14,cos25_64
	smull   r12,r4,r14,r4
	sub     r5,r5,r10
	mov     r5,r5,asl#5     ;p13
	ldr     r14,cos27_64
	smull   r12,r5,r14,r5
	sub     r6,r6,r9
	mov     r6,r6,asl#5     ;p14
	ldr     r14,cos29_64
	smull   r12,r6,r14,r6
	sub     r7,r7,r8
	mov     r7,r7,asl#5     ;p15
	ldr     r14,cos31_64
	smull   r12,r7,r14,r7

	; r0-r7 is p8-p15  |  r13[0-7] is p0-p7

;---------------------------------------------------

	ldmia   r13,{r9-r12}

	ldr     r14,cos1_32
	sub     r8,r9,r7
	mov     r8,r8,asl#5     ;pp8 = (p0 - p15) * cos1_32
	add     r7,r9,r7        ;pp0 = (p0 + p15)
	smull   r9,r8,r14,r8

	ldr     r14,cos3_32
	sub     r9,r10,r6
	mov     r9,r9,asl#5     ;pp9 = (p1 - p14) * cos3_32
	add     r6,r10,r6       ;pp1 = (p1 + p14)
	smull   r10,r9,r14,r9

	ldr     r14,cos5_32
	sub     r10,r11,r5
	mov     r10,r10,asl#5   ;pp10 = (p2 - p13) * cos5_32
	add     r5,r11,r5       ;pp2 = (p2 + p13)
	smull   r11,r10,r14,r10

	ldr     r14,cos7_32
	sub     r11,r12,r4
	mov     r11,r11,asl#5   ;pp11 = (p3 - p12) * cos7_32
	add     r4,r12,r4       ;pp3 = (p3 + p12)
	smull   r12,r11,r14,r11

	add     r12,r13,#8*4
	stmia   r12,{r8-r11}

	ldmdb   r12,{r9-r12}

	ldr     r14,cos9_32
	sub     r8,r9,r3
	mov     r8,r8,asl#5     ;pp12 = (p4 - p11) * cos9_32
	add     r3,r9,r3        ;pp4 = (p4 + p11)
	smull   r9,r8,r14,r8

	ldr     r14,cos11_32
	sub     r9,r10,r2
	mov     r9,r9,asl#5     ;pp13 = (p5 - p10) * cos11_32
	add     r2,r10,r2       ;pp5 = (p5 + p10)
	smull   r10,r9,r14,r9

	ldr     r14,cos13_32
	sub     r10,r11,r1
	mov     r10,r10,asl#5   ;pp14 = (p6 - p9) * cos13_32
	add     r1,r11,r1       ;pp6 = (p6 + p9)
	smull   r11,r10,r14,r10

	ldr     r14,cos15_32
	sub     r11,r12,r0
	mov     r11,r11,asl#5   ;pp15 = (p7 - p8) * cos15_32
	add     r0,r12,r0       ;pp7 = (p7 + p8)
	smull   r12,r11,r14,r11

	add     r12,r13,#12*4
	stmia   r12,{r8-r11}

	; r0-r7 is pp7-pp0  |  r13[8-15] is pp8-pp15

;---------------------------------------------------

	sub     r8,r7,r0
	mov     r8,r8,asl#5 ;p4 = (pp0 - pp7) * cos1_16
	add     r0,r7,r0    ;p0 = (pp0 + pp7)

	sub     r9,r6,r1
	mov     r9,r9,asl#5 ;p5 = (pp1 - pp6) * cos3_16
	add     r1,r6,r1    ;p1 = (pp1 + pp6)

	sub     r6,r5,r2
	mov     r6,r6,asl#5 ;p6 = (pp2 - pp5) * cos5_16
	add     r2,r5,r2    ;p2 = (pp2 + pp5)

	sub     r7,r4,r3
	mov     r7,r7,asl#5 ;p7 = (pp3 - pp4) * cos7_16
	add     r3,r4,r3    ;p3 = (pp3 + pp4)

	mov     r4,r8
	mov     r5,r9

	ADRL    r14,cos1_16
	ldmia   r14,{r8-r11}    ; cos1_16, cos3_16, cos5_16, cos7_16
	smull   r14,r4,r8,r4
	smull   r14,r5,r9,r5
	smull   r14,r6,r10,r6
	smull   r14,r7,r11,r7

	; r0-r7 is p0-p7
;==
	ldr     r14,cos3_8

	sub     r8,r1,r2
	mov     r8,r8,asl#5 ;pp3 = (p1 - p2) * cos3_8
	add     r1,r1,r2    ;pp1 = (p1 + p2)
	smull   r12,r8,r14,r8

	sub     r9,r5,r6
	mov     r9,r9,asl#5 ;pp7 = (p5 - p6) * cos3_8
	add     r5,r5,r6    ;pp5 = (p5 + p6)
	smull   r12,r9,r14,r9

	ldr     r14,cos1_8

	sub     r2,r0,r3
	mov     r2,r2,asl#5 ;pp2 = (p0 - p3) * cos1_8
	add     r0,r0,r3    ;pp0 = (p0 + p3)
	smull   r12,r2,r14,r2

	sub     r6,r4,r7
	mov     r6,r6,asl#5 ;pp6 = (p4 - p7) * cos1_8
	add     r4,r4,r7    ;pp4 = (p4 + p7)
	smull   r12,r6,r14,r6

	mov     r3,r8
	mov     r7,r9

	; r0-r7 is pp0-pp7
;==
	ldr     r14,cos1_4

	sub     r8,r0,r1
	mov     r8,r8,asl#5 ;p1 = (pp0 - pp1) * cos1_4
	add     r0,r0,r1    ;p0 = (pp0 + pp1)
	smull   r12,r1,r14,r8

	sub     r8,r2,r3
	mov     r8,r8,asl#5 ;p3 = (pp2 - pp3) * cos1_4
	add     r2,r2,r3    ;p2 = (pp2 + pp3)
	smull   r12,r3,r14,r8

	sub     r8,r4,r5
	mov     r8,r8,asl#5 ;p5 = (pp4 - pp5) * cos1_4
	add     r4,r4,r5    ;p4 = (pp4 + pp5)
	smull   r12,r5,r14,r8

	sub     r8,r6,r7
	mov     r8,r8,asl#5 ;p7 = (pp6 - pp7) * cos1_4
	add     r6,r6,r7    ;p6 = (pp6 + pp7)
	smull   r12,r7,r14,r8

	stmia   r13,{r0-r7}     ; p0-p7

;---------------------------------------------------

	add     r12,r13,#8*4
	ldmia   r12,{r0-r3,r8-r11}  ;pp8-pp15

	sub     r4,r0,r11
	mov     r4,r4,asl#5 ;p12 = (pp8 - pp15) * cos1_16
	add     r0,r0,r11   ;p8  = (pp8 + pp15)

	sub     r5,r1,r10
	mov     r5,r5,asl#5 ;p13 = (pp9 - pp14) * cos3_16
	add     r1,r1,r10   ;p9  = (pp9 + pp14)

	sub     r6,r2,r9
	mov     r6,r6,asl#5 ;p14 = (pp10 - pp13) * cos5_16
	add     r2,r2,r9    ;p10 = (pp10 + pp13)

	sub     r7,r3,r8
	mov     r7,r7,asl#5 ;p15 = (pp11 - pp12) * cos7_16
	add     r3,r3,r8    ;p11 = (pp11 + pp12)

	ADRL    r14,cos1_16
	ldmia   r14,{r8-r11}    ; cos1_16, cos3_16, cos5_16, cos7_16
	smull   r14,r4,r8,r4
	smull   r14,r5,r9,r5
	smull   r14,r6,r10,r6
	smull   r14,r7,r11,r7

	; r0-r7 is p8-p15
;==
	ldr     r14,cos1_8

	sub     r8,r0,r3
	mov     r8,r8,asl#5 ;pp10 = (p8 - p11) * cos1_8
	add     r0,r0,r3    ;pp8  = (p8 + p11)
	smull   r12,r8,r14,r8

	sub     r9,r4,r7
	mov     r9,r9,asl#5 ;pp14 = (p12 - p15) * cos1_8
	add     r4,r4,r7    ;pp12 = (p12 + p15)
	smull   r12,r9,r14,r9

	ldr     r14,cos3_8

	sub     r3,r1,r2
	mov     r3,r3,asl#5 ;pp11 = (p9 - p10) * cos3_8
	add     r1,r1,r2    ;pp9  = (p9 + p10)
	smull   r12,r3,r14,r3

	sub     r7,r5,r6
	mov     r7,r7,asl#5 ;pp15 = (p13 - p14) * cos3_8
	add     r5,r5,r6    ;pp13 = (p13 + p14)
	smull   r12,r7,r14,r7

	mov     r2,r8
	mov     r6,r9

	; r0-r7 is pp8-pp15
;==
	ldr     r14,cos1_4

	sub     r8,r0,r1
	mov     r8,r8,asl#5 ;p9 = (pp8 - pp9) * cos1_4
	add     r0,r0,r1    ;p8 = (pp8 + pp9)
	smull   r12,r1,r14,r8

	sub     r8,r2,r3
	mov     r8,r8,asl#5 ;p11 = (pp10 - pp11) * cos1_4
	add     r2,r2,r3    ;p10 = (pp10 + pp11)
	smull   r12,r3,r14,r8

	sub     r8,r4,r5
	mov     r8,r8,asl#5 ;p13 = (pp12 - pp13) * cos1_4
	add     r4,r4,r5    ;p12 = (pp12 + pp13)
	smull   r12,r5,r14,r8

	sub     r8,r6,r7
	mov     r8,r8,asl#5 ;p15 = (pp14 - pp15) * cos1_4
	add     r6,r6,r7    ;p14 = (pp14 + pp15)
	smull   r12,r7,r14,r8

	; r0-r7 is p8-p15

;---------------------------------------------------

	ldr     r8,[r13,#16*4]  ; reload buffer pointers
	ldr     r9,[r13,#17*4]

	add     r14,r5,r7       ;tmp = p13 + p15
	ldr     r11,[r13,#1*4]  ;r11 = p1
	add     r12,r11,r1
	add     r12,r12,r14
	str     r12,[r8,#1*64]  ;buf[1] = p1 + p9 + tmp
	ldr     r11,[r13,#5*4]  ;r11 = p5
	ldr     r10,[r13,#7*4]  ;r10 = p7
	add     r12,r11,r10
	add     r12,r12,r3
	add     r12,r12,r14
	str     r12,[r8,#5*64]  ;buf[5]  = p5 + p7 + p11 + tmp
	add     r14,r14,r1      ;tmp += p9
	ldr     r1,[r13,#1*4]   ;r1 = p1
	add     r12,r1,r6
	add     r12,r12,r14
	rsb     r12,r12,#0
	str     r12,[r9,#1*64]  ;buf[33] = -(p1 + p14 + tmp)
	add     r14,r14,r11
	add     r14,r14,r10     ;tmp += p5 + p7
	str     r14,[r8,#3*64]  ;buf[3]  = tmp
	ldr     r1,[r13,#6*4]   ;r1 = p6
	add     r12,r1,r6
	add     r12,r12,r14
	rsb     r12,r12,#0
	str     r12,[r9,#3*64]  ;buf[35] = -(p6 + p14 + tmp)
	add     r14,r2,r3
	add     r14,r14,r6
	add     r14,r14,r7      ;tmp = p10 + p11 + p14 + p15
	ldr     r2,[r13,#4*4]   ;r2 = p4
	add     r2,r2,r1
	add     r2,r2,r10       ;r2 = p4+p6+p7
	add     r12,r2,r14
	add     r12,r12,r4
	rsb     r12,r12,#0
	str     r12,[r9,#11*64] ;buf[43] = -(p4 + p6 + p7 + tmp + p12)
	add     r12,r11,r1
	add     r12,r12,r10
	add     r12,r12,r14
	add     r12,r12,r5
	rsb     r12,r12,#0
	str     r12,[r9,#5*64]  ;buf[37] = -(p5 + p6 + p7 + tmp + p13)
	ldr     r1,[r13,#2*4]   ;r1 = p2
	ldr     r11,[r13,#3*4]  ;r11 = p3
	add     r1,r1,r11       ;r1 = p2+p3
	add     r12,r1,r14
	add     r12,r12,r5
	rsb     r12,r12,#0
	str     r12,[r9,#7*64]  ;buf[39] = -(p2 + p3 + tmp + p13)
	add     r12,r1,r14
	add     r12,r12,r4
	rsb     r12,r12,#0
	str     r12,[r9,#9*64]  ;buf[41] = -(p2 + p3 + tmp + p12)
	add     r14,r0,r4
	add     r14,r14,r6
	add     r14,r14,r7      ;tmp = p8 + p12 + p14 + p15
	ldr     r0,[r13,#0*4]   ;r0 = p0
	add     r12,r2,r14
	rsb     r12,r12,#0
	str     r12,[r9,#13*64] ;buf[45] = -(p4 + p6 + p7 + tmp)
	add     r12,r0,r14
	rsb     r12,r12,#0
	str     r12,[r9,#15*64] ;buf[47] = -(p0 + tmp)
	add     r14,r3,r7       ;tmp = p11 + p15
	add     r12,r10,r14
	str     r12,[r8,#11*64] ;buf[11] = p7 + tmp
	add     r14,r14,r11     ;tmp += p3
	str     r14,[r8,#9*64]  ;buf[9]  = tmp
	add     r12,r5,r14
	str     r12,[r8,#7*64]  ;buf[7]  = p13 + tmp
	add     r12,r10,r7
	str     r12,[r8,#13*64] ;buf[13] = p7 + p15
	str     r7,[r8,#15*64]  ;buf[15] = p15
	mov     r12,#0
	str     r12,[r8,#16*64] ;buf[16] = 0

	add     r13,r13,#(16+3)*4   ; drop the scratch buffer and the saved r0-r2

	ldmfd r13!,{r4-r12,pc}

;==========================================================================================

hannunit * 24

; parameters:
;  r0 = smps    out    ptr to array of shorts   fixed, unit is 2^15
;  r1 = hann    in     ptr to hann window       fixed, unit is 2^hannunit
;  r2 = buf     in     ptr to channel buffer    fixed, unit is 2^xunit
;  r3 = step    in     steps in smps (channels) integer
;  [r13,#0] = offset                            integer
;
; returns:
;  r0 = number of samples that had to be clipped to the 16-bit range
;
; Produces 32 output samples. Each sample is a 64-bit multiply-accumulate of
; window words against buffer words, scaled down to unit 2^15, clamped to
; -0x8000..0x7fff and stored as two bytes (low byte first). After each store
; r0 advances by step*2 bytes.
;   j = 0-15  : 16 products per sample, window read forwards
;   j = 16    : only the odd-numbered products (the others are marked "is 0")
;   j = 17-31 : window read backwards (ldmdb), odd-numbered buffer words negated
; Inside each sample the step (r4) and clip count (r14) are spilled to the
; stack because r4-r12 and r14 are all needed for the multiply-accumulate.
; The scaling code is selected at assembly time from xunit + hannunit - 32.
|subbandsynthesis_3|

;r3=j
;r4=step
;r12=sum
;r14=clip

	stmfd   r13!,{r4-r12,r14}

	mov     r14,#0      ; clip = 0
	mov     r4,r3

	mov     r3,#16      ;j=0-15

	; Forward windowing
sbs3_loop
	stmfd   r13!,{r4,r14}

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	smull   r14,r12,r4,r8
	smlal   r14,r12,r5,r9
	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	smlal   r14,r12,r4,r8
	smlal   r14,r12,r5,r9
	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	smlal   r14,r12,r4,r8
	smlal   r14,r12,r5,r9
	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	smlal   r14,r12,r4,r8
	smlal   r14,r12,r5,r9
	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11
	; r12:r14 = 64-bit sum; reduce it to unit 2^15 in r12, with rounding
 [ (xunit + hannunit - 32) < 15
	movs    r14,r14,lsr #32+(xunit + hannunit - 32 - 15)
	adc     r12,r14,r12,lsl #-(xunit + hannunit - 32 - 15)
 ]
 [ (xunit + hannunit - 32) = 15
	cmp     r14,#0
	addlt   r12,r12,#1
 ]
 [ (xunit + hannunit - 32) > 15
	movs    r12,r12,asr #(xunit + hannunit - 32 - 15)
	adc     r12,r12,#0
 ]

	add     r1,r1,#16*4

	mov     r10,#0x8000
	ldmfd   r13!,{r4,r14}

	cmp     r12,r10
	subge   r12,r10,#1      ; clamp to 0x7fff
	addge   r14,r14,#1

	cmn     r12,r10
	rsblt   r12,r10,#0      ; clamp to -0x8000
	addlt   r14,r14,#1

	strb    r12,[r0]
	mov     r12,r12,lsr#8
	strb    r12,[r0,#1]
	add     r0,r0,r4,LSL#1

	subs    r3,r3,#1
	bgt     sbs3_loop

        ;j=16

	stmfd   r13!,{r4,r14}

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
; is 0	smull   r14,r12,r4,r8
;	smlal   r14,r12,r5,r9
	smull   r14,r12,r5,r9
; is 0	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
; is 0	smlal   r14,r12,r4,r8
	smlal   r14,r12,r5,r9
; is 0	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
; is 0	smlal   r14,r12,r4,r8
	smlal   r14,r12,r5,r9
; is 0	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11

	ldmia   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
; is 0	smlal   r14,r12,r4,r8
	smlal   r14,r12,r5,r9
; is 0	smlal   r14,r12,r6,r10
	smlal   r14,r12,r7,r11
	; same scaling as in the forward loop
 [ (xunit + hannunit - 32) < 15
	movs    r14,r14,lsr #32+(xunit + hannunit - 32 - 15)
	adc     r12,r14,r12,lsl #-(xunit + hannunit - 32 - 15)
 ]
 [ (xunit + hannunit - 32) = 15
	cmp     r14,#0
	addlt   r12,r12,#1
 ]
 [ (xunit + hannunit - 32) > 15
	movs    r12,r12,asr #(xunit + hannunit - 32 - 15)
	adc     r12,r12,#0
 ]

;	add     r1,r1,#16*4   ; (1)

	mov     r10,#0x8000
	ldmfd   r13!,{r4,r14}

	cmp     r12,r10
	subge   r12,r10,#1      ; clamp to 0x7fff
	addge   r14,r14,#1

	cmn     r12,r10
	rsblt   r12,r10,#0      ; clamp to -0x8000
	addlt   r14,r14,#1

	strb    r12,[r0]
	mov     r12,r12,lsr#8
	strb    r12,[r0,#1]
	add     r0,r0,r4,LSL#1

	; Reverse windowing
	mov     r3,#15      ;j=17-31
	; but we are already at middle + 16 - offset + 16 [and not + 32 because (1)]
    ; and we need to go to middle -16 + offset
	ldr     r5,[r13,#10*4] ; offset (first stacked argument, above the 10 saved registers)
	add     r1,r1,r5,LSL#3
	sub     r1,r1,#48*4

sbs3_loop2
	sub     r2,r2,#32*4     ; net effect with the 16 words read below: buf steps back 16 words per sample
	stmfd   r13!,{r4,r14}

	ldmdb   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	rsb     r5,r5,#0
	rsb     r7,r7,#0
	smull   r14,r12,r4,r11
	smlal   r14,r12,r5,r10
	smlal   r14,r12,r6,r9
	smlal   r14,r12,r7,r8

	ldmdb   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	rsb     r5,r5,#0
	rsb     r7,r7,#0
	smlal   r14,r12,r4,r11
	smlal   r14,r12,r5,r10
	smlal   r14,r12,r6,r9
	smlal   r14,r12,r7,r8

	ldmdb   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	rsb     r5,r5,#0
	rsb     r7,r7,#0
	smlal   r14,r12,r4,r11
	smlal   r14,r12,r5,r10
	smlal   r14,r12,r6,r9
	smlal   r14,r12,r7,r8

	ldmdb   r1!,{r8-r11}
	ldmia   r2!,{r4-r7}
	rsb     r5,r5,#0
	rsb     r7,r7,#0
	smlal   r14,r12,r4,r11
	smlal   r14,r12,r5,r10
	smlal   r14,r12,r6,r9
	smlal   r14,r12,r7,r8
	; same scaling as in the forward loop
 [ (xunit + hannunit - 32) < 15
	movs    r14,r14,lsr #32+(xunit + hannunit - 32 - 15)
	adc     r12,r14,r12,lsl #-(xunit + hannunit - 32 - 15)
 ]
 [ (xunit + hannunit - 32) = 15
	cmp     r14,#0
	addlt   r12,r12,#1
 ]
 [ (xunit + hannunit - 32) > 15
	movs    r12,r12,asr #(xunit + hannunit - 32 - 15)
	adc     r12,r12,#0
 ]

	sub     r1,r1,#16*4

	mov     r10,#0x8000
	ldmfd   r13!,{r4,r14}

	cmp     r12,r10
	subge   r12,r10,#1      ; clamp to 0x7fff
	addge   r14,r14,#1

	cmn     r12,r10
	rsblt   r12,r10,#0      ; clamp to -0x8000
	addlt   r14,r14,#1

	strb    r12,[r0]
	mov     r12,r12,lsr#8
	strb    r12,[r0,#1]
	add     r0,r0,r4,LSL#1

	subs    r3,r3,#1
	bgt     sbs3_loop2

	mov     r0,R14 ;clipped
	ldmfd   r13!,{r4-r12,pc}


;==========================================================================================

; parameters:
;  r0 = o1      inout  ptr to 18 values         fixed
;  r1 = wintab  in     ptr to array of values   fixed, unit is 2^winshift
;  r2 = tsbuf   out    ptr to array of values   fixed

winshift * 27

; DCT36_MACRO $n: one output step for index v = $n.
; Pops two words (sum0, sum1) from the stack via r13, reads four window words
; through r1 (post-incremented), then:
;   ts[32*(8-n)] = out1[8-n] + (sum0 - sum1) * w[0]
;   ts[32*(9+n)] = out1[9+n] + (sum0 - sum1) * w[1]
;   out2[9+n]    = (sum0 + sum1) * w[3]
;   out2[8-n]    = (sum0 + sum1) * w[2]
; out1/out2 are both addressed through r0, ts through r2. Products use LMULX
; with winshift. Clobbers r3-r5, r8-r12, r14 and the flags; advances r1 and r13.
	MACRO
	DCT36_MACRO $n

	ldr     r4,[r13],#4             ;sum0 = tmp[0+(v<<1)]
	ldr     r5,[r13],#4             ;sum1 = tmp[1+(v<<1)]

	ldmia   r1!,{r8-r11}            ;w[]

	add     r3,r4,r5                ;tmpval = sum0 + sum1
	sub     r4,r4,r5                ;sum0 -= sum1

	ldr     r14,[r0,#(8-$n)*4]      ;r14 = out1[8-(v)]
	LMULX   r12,r8,r4,winshift
	add     r12,r14,r12
	str     r12,[r2,#32*(8-$n)*4]   ;ts[SBLIMIT*(8-(v))] = out1[8-(v)] +  sum0 * w[0+(v*4)]
	ldr     r14,[r0,#(9+$n)*4]      ;r14 = out1[9+(v)]
	LMULX   r12,r9,r4,winshift
	add     r12,r14,r12
	str     r12,[r2,#32*(9+$n)*4]   ;ts[SBLIMIT*(9+(v))] = out1[9+(v)] +  sum0 * w[1+(v*4)]

	LMULX   r12,r11,r3,winshift
	str     r12,[r0,#(9+$n)*4]      ;out2[9+(v)] =  tmpval * w[3+(v*4)]

	LMULX   r12,r10,r3,winshift
	str     r12,[r0,#(8-$n)*4]      ;out2[8-(v)] = tmpval * w[2+(v*4)]
	MEND

	MACRO
	DCT36_MACRO2 $n,$m
	; Two output steps in one expansion: index $n then index $m.
	; Four words are popped from the stack at once (r4,r5 for $n; r6,r7 for $m)
	; and two groups of four window words are read through r1.
	; Uses r0 (out1/out2), r1 (window), r2 (ts) as pointers.
	; Clobbers r3-r12, r14 and the flags; advances r1 and r13.

; macro 0-1
	ldmia   r13!,{r4-r7}            ;tmp[]

; macro 0
	ldmia   r1!,{r8-r11}            ;w[]

	add     r3,r4,r5                ;tmpval = sum0 + sum1
	sub     r4,r4,r5                ;sum0 -= sum1

	ldr     r14,[r0,#(8-$n)*4]      ;r14 = out1[8-(v)]
	LMULX   r12,r8,r4,winshift
	add     r12,r14,r12
	str     r12,[r2,#32*(8-$n)*4]   ;ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[0+(v*4)]
	ldr     r14,[r0,#(9+$n)*4]      ;r14 = out1[9+(v)]
	LMULX   r12,r9,r4,winshift
	add     r12,r14,r12
	str     r12,[r2,#32*(9+$n)*4]   ;ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[1+(v*4)]

	LMULX   r12,r11,r3,winshift
	str     r12,[r0,#(9+$n)*4]      ;out2[9+(v)] = tmpval * w[3+(v*4)]

	LMULX   r12,r10,r3,winshift
	str     r12,[r0,#(8-$n)*4]      ;out2[8-(v)] = tmpval * w[2+(v*4)]
; macro 1
	ldmia   r1!,{r8-r11}

	add     r3,r6,r7                ;tmpval = sum0 + sum1
	sub     r6,r6,r7                ;sum0 -= sum1

	ldr     r14,[r0,#(8-$m)*4]      ;r14 = out1[8-(v)]
	LMULX   r12,r8,r6,winshift
	add     r12,r14,r12
	str     r12,[r2,#32*(8-$m)*4]   ;ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[0+(v*4)]
	ldr     r14,[r0,#(9+$m)*4]      ;r14 = out1[9+(v)]
	LMULX   r12,r9,r6,winshift
	add     r12,r14,r12
	str     r12,[r2,#32*(9+$m)*4]   ;ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[1+(v*4)]

	LMULX   r12,r11,r3,winshift
	str     r12,[r0,#(9+$m)*4]      ;out2[9+(v)] = tmpval * w[3+(v*4)]

	LMULX   r12,r10,r3,winshift
	str     r12,[r0,#(8-$m)*4]      ;out2[8-(v)] = tmpval * w[2+(v*4)]
	MEND

;==========================================================================================

cosshift * 30
; 1 is 1<<30
; Each comment gives the angle whose cosine is stored; a leading "-" means the
; negated cosine of that angle is stored.
cos6_1  ;   pi/6
	DCD &376cf5d0
cos6_2  ;   2*pi/6  (= 0.5)
	DCD &20000000
cos9    ;   pi/9       - 4*pi/9   7*pi/9
	DCD &3c23ec84, &f4e2f2c1, &cef920bb
cos18   ;   pi/18      - 7*pi/18  13*pi/18
	DCD &3f071719, &ea1c478c, &d6dc915c

tfshift * 28
; 1 is 1<<28
; NOTE(review): the values match 1/(2*cos(pi*(2*i+1)/36)) for i = 0..8 - confirm
tfcos36
	DCD &0807D2B1, &08483EE0, &08D3B7CD
    DCD &09C42577, &0B504F33, &0DF2943B
    DCD &12EDFB18, &1EE8DD47, &5BCA2A26

; parameters:
;  r0 = inbuf   in     ptr to 18 bands          fixed, corrupted on exit?
;  r1 = o1      inout  ptr to 18 values         fixed
;  r2 = wintab  in     ptr to array of values   fixed, unit is 2^winshift
;  r3 = tsbuf   out    ptr to array of values   fixed

|dct36|

;r1-r8=t0-t7

	stmfd   r13!,{r1-r3}
	stmfd   r13!,{r4-r12,r14}
	sub     r13,r13,#18*4   ;r13=tmp

	add     r0,r0,#8*4

	ldmia   r0,{r4-r12,r14}

	add     r14,r14,r12     ;in[17]+=in[16]
	add     r12,r12,r11     ;in[16]+=in[15]
	add     r11,r11,r10     ;in[15]+=in[14]
	add     r10,r10,r9      ;in[14]+=in[13]
	add     r9,r9,r8        ;in[13]+=in[12]
	add     r8,r8,r7        ;in[12]+=in[11]
	add     r7,r7,r6        ;in[11]+=in[10]
	add     r6,r6,r5        ;in[10]+=in[9]
	add     r5,r5,r4        ;in[9]+=in[8]

	add     r14,r14,r11     ;in[17]+=in[15]
	add     r11,r11,r9      ;in[15]+=in[13]
	add     r9,r9,r7        ;in[13]+=in[11]
	add     r7,r7,r5        ;in[11]+=in[9]

	add     r0,r0,#2*4
	stmia   r0,{r6-r12,r14}

	mov     r12,r4
	mov     r14,r5
	sub     r0,r0,#10*4

	ldmia   r0,{r4-r11}

	add     r12,r12,r11     ;in[8]+=in[7]
	add     r11,r11,r10     ;in[7]+=in[6]
	add     r10,r10,r9      ;in[6]+=in[5]
	add     r9,r9,r8        ;in[5]+=in[4]
	add     r8,r8,r7        ;in[4]+=in[3]
	add     r7,r7,r6        ;in[3]+=in[2]
	add     r6,r6,r5        ;in[2]+=in[1]
	add     r5,r5,r4        ;in[1]+=in[0]

	add     r14,r14,r11     ;in[9]+=in[7]
	add     r11,r11,r9      ;in[7]+=in[5]
	add     r9,r9,r7        ;in[5]+=in[3]
	add     r7,r7,r5        ;in[3]+=in[1]

	stmia   r0,{r4-r12,r14}

;----------------------------------------------------------------------------

;;	ldr     r14,cos6_2
	ldr     r12,[r0,#12*4]      ;r12=in[12]
	ldr     r11,[r0,#8*4]       ;r11=in[8]
;;	LMULX   r2,r12,r14,cosshift ;t1 = COS6_2 * in[12]
	MOV     R2,R12,ASR #1

	ldr     r10,[r0,#16*4]      ;r10=in[16]
	ldr     r9,[r0,#4*4]        ;r9=in[4]
	ldr     r12,[r0,#0*4]       ;r12=in[0]
	add     r4,r11,r10
	sub     r4,r4,r9
;;	LMULX   r3,r4,r14,cosshift  ;t2 = COS6_2 * (in[8] + in[16] - in[4])
	MOV     R3,R4,ASR #1

	add     r4,r12,r2           ;t3 = in[0] + t1
	sub     r5,r12,r2,asl#1     ;t4 = in[0] - t1 - t1
	sub     r6,r5,r3            ;t5 = t4 - t2

	ldr     r14,cos9
	add     r12,r9,r11
	LMULX   r1,r12,r14,cosshift ;t0 = cos9[0] * (in[4] + in[8])

	ldr     r14,cos9+4
	sub     r12,r11,r10
	LMULX   r2,r12,r14,cosshift ;t1 = cos9[1] * (in[8] - in[16])

	add     r12,r5,r3,asl#1
	str     r12,[r13,#(4*2)*4]  ;tmp[4*2] = t4 + t2 + t2

	ldr     r14,cos9+8
	add     r12,r9,r10
	LMULX   r3,r12,r14,cosshift ;t2 = cos9[2] * (in[4] + in[16])

	sub     r7,r4,r1
	sub     r7,r7,r3            ;t6 = t3 - t0 - t2
	add     r1,r1,r4
	add     r1,r1,r2            ;t0 += t3 + t1
	add     r4,r4,r3
	sub     r4,r4,r2            ;t3 += t2 - t1

	ldr     r9,[r0,#2*4]        ;r9=in[2]
	ldr     r10,[r0,#10*4]      ;r10=in[10]
	ldr     r14,cos18
	add     r12,r9,r10
	LMULX   r3,r12,r14,cosshift ;t2 = cos18[0] * (in[2]  + in[10])

	ldr     r11,[r0,#14*4]      ;r11=in[14]
	ldr     r14,cos18+4
	sub     r12,r10,r11
	LMULX   r5,r12,r14,cosshift ;t4 = cos18[1] * (in[10] - in[14])

	ldr     r12,[r0,#6*4]
	ldr     r14,cos6_1
	LMULX   r8,r12,r14,cosshift ;t7 = COS6_1 * in[6]

	add     r2,r3,r5
	add     r2,r2,r8            ;t1 = t2 + t4 + t7

	add     r12,r1,r2
	str     r12,[r13,#(0*2)*4]  ;tmp[0*2] = t0 + t1

	sub     r12,r1,r2
	str     r12,[r13,#(8*2)*4]  ;tmp[8*2] = t0 - t1

	ldr     r14,cos18+8
	add     r12,r9,r11
	LMULX   r2,r12,r14,cosshift ;t1 =  cos18[2] * (in[2] + in[14])

	add     r3,r3,r2
	sub     r3,r3,r8            ;t2 += t1 - t7

	ldr     r14,cos6_1
	add     r12,r10,r11
	sub     r12,r12,r9
	LMULX   r1,r12,r14,cosshift ;t0 =  COS6_1 * (in[10] + in[14] - in[2])

	sub     r5,r5,r2
	sub     r5,r5,r8            ;t4 -= t1 + t7

	sub     r12,r6,r1
	str     r12,[r13,#(1*2)*4]  ;tmp[1*2] = t5 - t0

	add     r12,r7,r5
	str     r12,[r13,#(2*2)*4]  ;tmp[2*2] = t6 + t4

	add     r12,r4,r3
	str     r12,[r13,#(3*2)*4]  ;tmp[3*2] = t3 + t2

	sub     r12,r4,r3
	str     r12,[r13,#(5*2)*4]  ;tmp[5*2] = t3 - t2

	sub     r12,r7,r5
	str     r12,[r13,#(6*2)*4]  ;tmp[6*2] = t6 - t4

	add     r12,r6,r1
	str     r12,[r13,#(7*2)*4]  ;tmp[7*2] = t5 + t0

;----------------------------------------------------------------------------

	ldr     r12,[r0,#13*4]
;;	ldr     r14,cos6_2
	ldr     r9,[r0,#9*4]        ;r9=in[9]
;;	LMULX   r2,r12,r14,cosshift ;t1 =  COS6_2 * in[13]
	MOV     R2,R12,ASR #1

	ldr     r10,[r0,#17*4]      ;r10=in[17]
	ldr     r11,[r0,#5*4]       ;r11=in[5]
	add     r4,r9,r10
	ldr     r12,[r0,#1*4]       ;r12=in[1]
	sub     r4,r4,r11
;;	LMULX   r3,r4,r14,cosshift  ;t2 =  COS6_2 * (in[9] + in[17] - in[5])
	MOV     R3,R4,ASR #1

	add     r4,r12,r2           ;t3 = in[1] + t1
	sub     r5,r12,r2,asl#1     ;t4 = in[1] - t1 - t1
	sub     r6,r5,r3            ;t5 = t4 - t2

	ldr     r14,cos9
	add     r12,r11,r9
	LMULX   r1,r12,r14,cosshift ;t0 =  cos9[0] * (in[5] + in[9])

	ldr     r14,cos9+4
	sub     r12,r9,r10
	LMULX   r2,r12,r14,cosshift ;t1 =  cos9[1] * (in[9] - in[17])

	ldr     r14,tfcos36+(17-13)*4
	add     r12,r5,r3,asl#1
	LMULX   r9,r12,r14,tfshift  ;r9=xxx
	str     r9,[r13,#(1+4*2)*4] ;tmp[1+4*2] =  (t4 + t2 + t2) * tfcos36[17-13]

	ldr     r14,cos9+8
	add     r12,r11,r10
	LMULX   r3,r12,r14,cosshift ;t2 =  cos9[2] * (in[5] + in[17])

	sub     r7,r4,r1
	sub     r7,r7,r3            ;t6 = t3 - t0 - t2
	add     r1,r1,r4
	add     r1,r1,r2            ;t0 += t3 + t1
	add     r4,r4,r3
	sub     r4,r4,r2            ;t3 += t2 - t1

	ldr     r9,[r0,#3*4]        ;r9=in[3]
	ldr     r10,[r0,#11*4]      ;r10=in[11]
	ldr     r14,cos18
	add     r12,r9,r10
	LMULX   r3,r12,r14,cosshift ;t2 = cos18[0] * (in[3]  + in[11])

	ldr     r11,[r0,#15*4]      ;r11=in[15]
	ldr     r14,cos18+4
	sub     r12,r10,r11
	LMULX   r5,r12,r14,cosshift ;t4 = cos18[1] * (in[11] - in[15])

	ldr     r12,[r0,#7*4]
	ldr     r14,cos6_1
	LMULX   r8,r12,r14,cosshift ;t7 = COS6_1 * in[7]

	add     r2,r3,r5
	add     r2,r2,r8            ;t1 = t2 + t4 + t7

	ldr     r14,tfcos36+(17-17)*4
	add     r12,r1,r2
	LMULX   r10,r12,r14,tfshift ;r10=xxx
	str     r10,[r13,#(1+0*2)*4];tmp[1+0*2] = (t0 + t1) * tfcos36[17-17]

	ldr     r14,tfcos36+(17-9)*4
	sub     r12,r1,r2
	LMULX   r10,r12,r14,tfshift
	str     r10,[r13,#(1+8*2)*4];tmp[1+8*2]  = (t0 - t1) * tfcos36[17-9]

	ldr     r14,cos18+8
	add     r12,r9,r11
	LMULX   r2,r12,r14,cosshift ;t1 =  cos18[2] * (in[3] + in[15])

	add     r3,r3,r2
	sub     r3,r3,r8            ;t2 += t1 - t7

	ldr     r10,[r0,#11*4]
	ldr     r14,cos6_1
	add     r12,r10,r11
	sub     r12,r12,r9
	LMULX   r1,r12,r14,cosshift ;t0 =  COS6_1 * (in[11] + in[15] - in[3])

	sub     r5,r5,r2
	sub     r5,r5,r8            ;t4 -= t1 + t7

	ldr     r14,tfcos36+(17-16)*4
	sub     r12,r6,r1
	LMULX   r10,r12,r14,tfshift
	str     r10,[r13,#(1+1*2)*4];tmp[1+1*2] = (t5 - t0) * tfcos36[17-16]

	ldr     r14,tfcos36+(17-15)*4
	add     r12,r7,r5
	LMULX   r10,r12,r14,tfshift
	str     r10,[r13,#(1+2*2)*4];tmp[1+2*2] = (t6 + t4) * tfcos36[17-15]

	ldr     r14,tfcos36+(17-14)*4
	add     r12,r4,r3
	LMULX   r10,r12,r14,tfshift
	str     r10,[r13,#(1+3*2)*4];tmp[1+3*2] = (t3 + t2) * tfcos36[17-14]

	ldr     r14,tfcos36+(17-12)*4
	sub     r12,r4,r3
	LMULX   r10,r12,r14,tfshift
	str     r10,[r13,#(1+5*2)*4];tmp[1+5*2] = (t3 - t2) * tfcos36[17-12]

	ldr     r14,tfcos36+(17-11)*4
	sub     r12,r7,r5
	LMULX   r10,r12,r14,tfshift
	str     r10,[r13,#(1+6*2)*4];tmp[1+6*2] = (t6 - t4) * tfcos36[17-11]

	ldr     r14,tfcos36+(17-10)*4
	add     r12,r6,r1
	LMULX   r10,r12,r14,tfshift
	str     r10,[r13,#(1+7*2)*4];tmp[1+7*2] = (t5 + t0) * tfcos36[17-10]

;----------------------------------------------------------------------------
;r0=o1, r1=wintab, r2=tsbuf

	; restore input parameters
	add     r12,r13,#28*4
	ldmfd   r12,{r0-r2}

	DCT36_MACRO2    0,1
	DCT36_MACRO2    2,3
	DCT36_MACRO2    4,5
	DCT36_MACRO2    6,7
	DCT36_MACRO     8

	ldmfd   r13!,{r4-r12,r14}
	add     r13,r13,#3*4
	mov     pc,r14

;==========================================================================================

; parameters:
;  r0 = sblim   in     number of bands          integer
;  r1 = csa     in     ptr to (cs, ca) array    fixed 2^31
;  r2 = xr      inout  ptr to bands             fixed
;
; Alias-reduction butterflies across band boundaries, done in place.
;
; One pass is made per band counted by sblim.  Pass k (k = 1..sblim) works on
; the boundary at xr + 18*k words: 8 butterflies pair the words going upwards
; from the boundary (u) with the words going downwards from just below it (d):
;
;     u' = (u*cs + d*ca + 2^30) >> 31
;     d' = (d*cs - u*ca + 2^30) >> 31
;
; The sums are formed in 64 bits; the 2^30 term is the round-to-nearest carry
; taken from bit 30 of the low word.  csa is read as consecutive words
; cs,ca,cs,ca,... (16 words per pass) and is restarted from r1 for every pass.
;
; A non-positive sblim returns immediately without touching xr.  (Previously
; the count was only tested at the bottom of the loop, so one full pass was
; always executed.)
;
; Corrupts r0-r3 and the flags; r4-r12 are preserved.

|antialias_1|

; work
;r0=butterflies still to do (sblim*8, counted down by 2 per inner iteration)
;r3=current csa
;r4=xinc
;r5=xdec
;r6=*xinc
;r7=*xdec
;r8=*cs1
;r9=*ca1
;r10=*cs2
;r11=*ca2
;r12=low word of 64-bit accumulator
;r14=high word of 64-bit accumulator, then the rounded result

	CMP     R0,#0
	MOVLE   PC,R14         ; no bands: nothing to do, nothing stacked yet

	STMFD   R13!,{R4-R12,R14}

	MOV     R0,R0,LSL #3 ; sblim * 8
aa1_sb
	ADD     R2,R2,#(18<<2)
	MOV     R4,R2          ; start of next band
	SUB     R5,R2,#(1<<2)  ; end of current band
	MOV     R3,R1          ; coefficients restart for every boundary
aa1_ss
	LDMIA   R3!,{R8-R11}   ; cs1,ca1,cs2,ca2 : two butterflies per iteration

	LDR     R6,[R4]     ; u
	LDR     R7,[R5]     ; d
;	MOV     R6,R6,LSL #1
;	MOV     R7,R7,LSL #1

	SMULL   R12,R14,R6,R8 ; cs1
	SMLAL   R12,R14,R7,R9 ; ca1
	MOVS    R12,R12,LSR #31      ; r12 = bit 31 of low word, C = bit 30
	ADC     R14,R12,R14,LSL #1   ; (hi<<1) + bit 31 + rounding carry
	STR     R14,[R4],#4          ; u' = u*cs1 + d*ca1

	RSB     R6,R6,#0             ; -u, SMLAL can only accumulate
	SMULL   R12,R14,R6,R9 ; ca1
	SMLAL   R12,R14,R7,R8 ; cs1
	MOVS    R12,R12,LSR #31
	ADC     R14,R12,R14,LSL #1
	STR     R14,[R5],#-4         ; d' = d*cs1 - u*ca1

	LDR     R6,[R4]     ; u
	LDR     R7,[R5]     ; d
;	MOV     R6,R6,LSL #1
;	MOV     R7,R7,LSL #1

	SMULL   R12,R14,R6,R10 ; cs2
	SMLAL   R12,R14,R7,R11 ; ca2
	MOVS    R12,R12,LSR #31
	ADC     R14,R12,R14,LSL #1
	STR     R14,[R4],#4          ; u' = u*cs2 + d*ca2

	RSB     R6,R6,#0
	SMULL   R12,R14,R6,R11 ; ca2
	SMLAL   R12,R14,R7,R10 ; cs2
	MOVS    R12,R12,LSR #31
	ADC     R14,R12,R14,LSL #1
	STR     R14,[R5],#-4         ; d' = d*cs2 - u*ca2

	SUB     R0,R0,#2             ; two butterflies done
	TST     R0,#&7               ; low 3 bits clear again => 8 done for this boundary
	BNE     aa1_ss
	CMP     R0,#0
	BGT     aa1_sb               ; more bands left

	LDMFD   R13!,{R4-R12,PC}

;==========================================================================================

; parameters:
;  out: r0 = 0 if ok, error ptr otherwise
;
; Run-time probe for the long multiply-accumulate instruction (SMLAL).
; 251*241 is accumulated into a zeroed r1:r0 pair with SMLAL and the low word
; is compared with the same product computed by the ordinary 32-bit MUL.
; When they differ, r0 is returned pointing at Err_StrongARM: a zero word
; followed by a zero-terminated message.
; NOTE(review): relies on SMLAL not leaving the product in r0 (and not
; trapping) on processors that lack it - confirm for the targeted CPUs.
;
; Corrupts r1-r3 and the flags.

Err_StrongARM
	DCD     0
	= "Processor with 64-bit multiplication instructions (StrongARM or later) is required.",0
	ALIGN

|check_hardware|
	mov     r2,#251             ; multiplicand
	mov     r3,#241             ; multiplier
	mov     r0,#0               ; accumulator low word
	mov     r1,#0               ; accumulator high word
	smlal   r0,r1,r2,r3         ; r1:r0 += r2*r3
	mul     r1,r2,r3            ; reference product from the 32-bit multiply
	subs    r0,r0,r1            ; r0 = 0 and Z set when the low words agree
	adrne   r0,Err_StrongARM    ; mismatch: hand back the error block
	mov     pc,r14

	END
