;some optimized functions (yes!)

	AREA |A$$code|,READONLY,CODE

	EXPORT mdct_butterfly_32
	EXPORT mdct_butterfly_generic
	EXPORT mdct_b0
	EXPORT mdct_b1
	EXPORT mdct_bit

;****************************************************************************

; mdct_butterfly_8: 8-point butterfly done entirely in registers.
;
; In:   R0-R7 = x[0]..x[7], preloaded by the caller
;       R12   = x, the address the eight results are stored to
; Out:  R0-R7 = y[0]..y[7], also written to x[0]..x[7]
;       R12   = x + 8
; Scratch: R8-R11.  No registers are saved: the only caller in this file
; (mdct_butterfly_16) does not rely on any of them surviving the call.
mdct_butterfly_8

	SUB     R8 ,R4,R0   ; d0 = x4 - x0
	ADD     R9 ,R4,R0   ; s0 = x4 + x0
	SUB     R10,R5,R1   ; d1 = x5 - x1
	ADD     R11,R5,R1   ; s1 = x5 + x1

	SUB     R0,R6,R2    ; d2 = x6 - x2   (x0, x1, x4, x5 are dead from here)
	ADD     R4,R6,R2    ; s2 = x6 + x2
	SUB     R1,R7,R3    ; d3 = x7 - x3
	ADD     R5,R7,R3    ; s3 = x7 + x3

	SUB     R2,R0,R10   ; y2 = d2 - d1
	ADD     R0,R0,R10   ; y0 = d2 + d1
	ADD     R3,R1,R8    ; y3 = d3 + d0
	SUB     R1,R1,R8    ; y1 = d3 - d0
	ADD     R6,R4,R9    ; y6 = s2 + s0
	SUB     R4,R4,R9    ; y4 = s2 - s0
	ADD     R7,R5,R11   ; y7 = s3 + s1
	SUB     R5,R5,R11   ; y5 = s3 - s1

	STMIA   R12!,{R0-R7} ; write y0..y7, leaving R12 = x + 8

	MOV     PC,R14

; Cosine constants scaled by 1 << 31.  They are read with LDR Rd,<label>
; by mdct_butterfly_16 and mdct_butterfly_32.
cPI1_8 ; unit is 1 << 31
	DCD     &7641AF3C ; cos(PI/8)
cPI2_8 ; unit is 1 << 31
	DCD     &5A827999 ; cos(2*PI/8)
cPI3_8 ; unit is 1 << 31
	; NOTE(review): cos(3*PI/8) * 2^31 is about 821806413.4 (&30FBC54D);
	; the stored value is one unit higher - confirm this is intentional.
	DCD     &30FBC54E ; cos(3*PI/8)

; mdct_butterfly_16: 16-point butterfly, in place.
;
; In:   R12 = x
; Out:  R12 = x + 16
; Only R14 is saved; R0-R11 are overwritten.  The only caller in this file
; is mdct_butterfly_32, which does not need those registers preserved.
;
; x[8..15] receive the sums x[i+8] + x[i]; x[0..7] receive the differences
; (some of them scaled by cPI2_8).  Both halves are then finished by
; mdct_butterfly_8.

mdct_butterfly_16
	STR     R14,[R13,#-4]!    ; push the return address

	ADD     R10,R12,#8*4      ; R10 = x + 8
	LDMIA   R12,{R4-R7}       ; x0-x3
	LDMIA   R10,{R0-R1,R8-R9} ; x8-x11
	LDR     R11,cPI2_8

	SUB     R2,R7,R9          ; y2 = x3 - x11
	SUB     R3,R8,R6          ; y3 = x10 - x2
	ADD     R6,R8,R6          ; y10 = x10 + x2
	ADD     R7,R9,R7          ; y11 = x11 + x3
	SUB     R8,R5,R1          ; r0 = x1 - x9
	SUB     R9,R4,R0          ; r1 = x0 - x8
	ADD     R4,R0,R4          ; y8 = x8 + x0
	ADD     R5,R1,R5          ; y9 = x9 + x1
	STMIA   R10!,{R4-R7}      ; store y8-y11, R10 = x + 12

	ADD     R8,R8,R9          ; R8 = r0 + r1
	SUB     R9,R8,R9,LSL #1   ; R9 = (r0 + r1) - 2*r1 = r0 - r1
	MOV     R8,R8,ASL #1      ; pre-shift by one so that the high word of
	MOV     R9,R9,ASL #1      ; the 64-bit product is (value * c) >> 31
	SMULL   R8,R0,R11,R8      ; y0 = MULT_NORM((r0   + r1), cPI2_8)
	SMULL   R9,R1,R11,R9      ; y1 = MULT_NORM((r0   - r1), cPI2_8)
	STMIA   R12!,{R0-R3}      ; store y0-y3, R12 = x + 4

	LDMIA   R12,{R4-R7}       ; x4-x7
	LDMIA   R10,{R2-R3,R8-R9} ; x12-x15

	ADD     R0,R2,R4          ; y12 = x12 + x4
	ADD     R1,R3,R5          ; y13 = x13 + x5
	SUB     R4,R2,R4          ; r0 = x12 - x4
	SUB     R5,R3,R5          ; r1 = x13 - x5
	ADD     R2,R8,R6          ; y14 = x14 + x6
	ADD     R3,R9,R7          ; y15 = x15 + x7
	SUB     R6,R8,R6          ; y6 = x14 - x6
	SUB     R7,R9,R7          ; y7 = x15 - x7
	STMIA   R10,{R0-R3}       ; store y12-y15

	SUB     R8,R4,R5          ; R8 = r0 - r1
	ADD     R9,R4,R5          ; R9 = r0 + r1
	MOV     R4,R8,ASL #1
	MOV     R5,R9,ASL #1
	SMULL   R8,R4,R11,R4      ; y4 = MULT_NORM((r0   - r1), cPI2_8)
	SMULL   R9,R5,R11,R5      ; y5 = MULT_NORM((r0   + r1), cPI2_8)

	; y4-y7 are not stored here: they stay in R4-R7 and are handed
	; straight to mdct_butterfly_8 together with the reloaded y0-y3.
	SUB     R12,R12,#4*4      ; R12 = x
    LDMIA   R12,{R0-R3}       ; reload y0-y3
	BL      mdct_butterfly_8  ; lower half; returns with R12 = x + 8
    LDMIA   R12,{R0-R7}       ; y8-y15 as stored above
	BL      mdct_butterfly_8  ; upper half; returns with R12 = x + 16

	LDR     PC,[R13],#4       ; pop the return address into PC

; mdct_butterfly_32: 32-point butterfly, in place (exported).
;
; In:   R0 = x
; R4-R12 and R14 are saved on entry and restored on exit; R0-R3 are
; overwritten.
;
; R14 walks x[0..15] and R12 walks x[16..31], four words per step.  Each
; step stores the sums into the upper half and the rotated differences into
; the lower half.  The two 16-word halves are then finished by two calls to
; mdct_butterfly_16.
;
; Values are shifted left by one before SMULL/SMLAL so that the high word of
; the 64-bit result is (value * constant) >> 31; no rounding is applied here.
mdct_butterfly_32
	STMDB   R13!,{R4-R12,R14}
	MOV     R14,R0          ; R14 = x      (lower half pointer)
	ADD     R12,R0,#16*4    ; R12 = x + 16 (upper half pointer)
	LDR     R11,cPI2_8

	LDMIA   R12,{R4-R7}     ; x16-x19
	LDMIA   R14,{R0-R3}     ; x0-x3

	SUB     R8,R2,R6        ; r0 = x2 - x18
	SUB     R9,R3,R7        ; r1 = x3 - x19
	ADD     R6,R6,R2        ; y18 = x18 + x2
	ADD     R7,R7,R3        ; y19 = x19 + x3
	ADD     R9,R9,R8        ; R9 = r1 + r0
	SUB     R8,R9,R8,LSL #1 ; R8 = (r1 + r0) - 2*r0 = r1 - r0
	MOV     R8,R8,ASL #1
	MOV     R9,R9,ASL #1
	SMULL   R8,R3,R11,R8    ; y3 = MULT_NORM((r1 - r0), cPI2_8)
	SMULL   R9,R2,R11,R9    ; y2 = MULT_NORM((r1 + r0), cPI2_8)

	SUB     R8,R0,R4        ; r0 = x0 - x16
	SUB     R9,R1,R5        ; r1 = x1 - x17
	ADD     R4,R4,R0        ; y16 = x16 + x0
	ADD     R5,R5,R1        ; y17 = x17 + x1
	STMIA   R12!,{R4-R7}    ; store y16-y19, R12 = x + 20

	LDR     R4,cPI1_8
	LDR     R5,cPI3_8
	MOV     R8,R8,ASL #1
	MOV     R9,R9,ASL #1
	SMULL   R6,R0,R9,R5     ; y0 =  MULT_NORM(r1, cPI3_8)
	SMULL   R7,R1,R9,R4     ; y1 =  MULT_NORM(r1, cPI1_8)
	RSB     R5,R5,#0        ; -cPI3_8, so the SMLAL below subtracts
	SMLAL   R6,R0,R8,R4     ; y0 += MULT_NORM(r0, cPI1_8)
	SMLAL   R7,R1,R8,R5     ; y1 -= MULT_NORM(r0, cPI3_8)

	STMIA   R14!,{R0-R3}    ; store y0-y3, R14 = x + 4

	LDMIA   R12,{R4-R7}     ; x20-x23
	LDMIA   R14,{R0-R3}     ; x4-x7

	SUB     R8,R6,R2        ; r0 = x22 - x6
	SUB     R9,R3,R7        ; r1 = x7 - x23
	ADD     R6,R6,R2        ; y22 = x22 + x6
	ADD     R7,R7,R3        ; y23 = x23 + x7
	MOV     R2,R9           ; y6 = r1
	MOV     R3,R8           ; y7 = r0

	SUB     R8,R0,R4        ; r0 = x4 - x20
	SUB     R9,R1,R5        ; r1 = x5 - x21
	ADD     R4,R4,R0        ; y20 = x20 + x4
	ADD     R5,R5,R1        ; y21 = x21 + x5
	STMIA   R12!,{R4-R7}    ; store y20-y23, R12 = x + 24

	LDR     R4,cPI1_8
	LDR     R5,cPI3_8
	MOV     R8,R8,ASL #1
	MOV     R9,R9,ASL #1
	SMULL   R6,R0,R9,R4     ; y4 =  MULT_NORM(r1, cPI1_8)
	SMULL   R7,R1,R9,R5     ; y5 =  MULT_NORM(r1, cPI3_8)
	RSB     R4,R4,#0        ; -cPI1_8
	SMLAL   R6,R0,R8,R5     ; y4 += MULT_NORM(r0, cPI3_8)
	SMLAL   R7,R1,R8,R4     ; y5 -= MULT_NORM(r0, cPI1_8)
	STMIA   R14!,{R0-R3}    ; store y4-y7, R14 = x + 8

	LDMIA   R12,{R4-R7}     ; x24-x27
	LDMIA   R14,{R0-R3}     ; x8-x11

	SUB     R8,R6,R2        ; r0 = x26 - x10
	SUB     R9,R7,R3        ; r1 = x27 - x11
	ADD     R6,R6,R2        ; y26 = x26 + x10
	ADD     R7,R7,R3        ; y27 = x27 + x11
	ADD     R8,R8,R9        ; R8 = r0 + r1
	SUB     R9,R8,R9,LSL #1 ; R9 = (r0 + r1) - 2*r1 = r0 - r1
	MOV     R8,R8,ASL #1
	MOV     R9,R9,ASL #1
	SMULL   R8,R3,R11,R8    ; y11 = MULT_NORM((r0 + r1), cPI2_8)
	SMULL   R9,R2,R11,R9    ; y10 = MULT_NORM((r0 - r1), cPI2_8)

	SUB     R8,R4,R0        ; r0 = x24 - x8
	SUB     R9,R5,R1        ; r1 = x25 - x9
	ADD     R4,R4,R0        ; y24 = x24 + x8
	ADD     R5,R5,R1        ; y25 = x25 + x9
	STMIA   R12!,{R4-R7}    ; store y24-y27, R12 = x + 28

	LDR     R4,cPI1_8
	LDR     R5,cPI3_8
	MOV     R8,R8,ASL #1
	MOV     R9,R9,ASL #1
	SMULL   R6,R0,R8,R5     ; y8 =  MULT_NORM(r0, cPI3_8)
	SMULL   R7,R1,R8,R4     ; y9 =  MULT_NORM(r0, cPI1_8)
	RSB     R4,R4,#0        ; -cPI1_8
	SMLAL   R6,R0,R9,R4     ; y8 -= MULT_NORM(r1, cPI1_8)
	SMLAL   R7,R1,R9,R5     ; y9 += MULT_NORM(r1, cPI3_8)
	STMIA   R14!,{R0-R3}    ; store y8-y11, R14 = x + 12

	LDMIA   R12,{R4-R7}     ; x28-x31
	LDMIA   R14,{R0-R3}     ; x12-x15

	ADD     R6,R6,R2        ; y30 = x30 + x14
	ADD     R7,R7,R3        ; y31 = x31 + x15
	SUB     R2,R6,R2,ASL #1 ; y14 = x30 - x14
	SUB     R3,R7,R3,ASL #1 ; y15 = x31 - x15

	SUB     R8,R4,R0        ; r0 = x28 - x12
	SUB     R9,R5,R1        ; r1 = x29 - x13
	ADD     R4,R4,R0        ; y28 = x28 + x12
	ADD     R5,R5,R1        ; y29 = x29 + x13
	STMIA   R12!,{R4-R7}    ; store y28-y31

	LDR     R4,cPI1_8
	LDR     R5,cPI3_8
	MOV     R8,R8,ASL #1
	MOV     R9,R9,ASL #1
	SMULL   R6,R0,R8,R4     ; y12 =  MULT_NORM(r0, cPI1_8)
	SMULL   R7,R1,R8,R5     ; y13 =  MULT_NORM(r0, cPI3_8)
	RSB     R5,R5,#0        ; -cPI3_8
	SMLAL   R6,R0,R9,R5     ; y12 -= MULT_NORM(r1, cPI3_8)
	SMLAL   R7,R1,R9,R4     ; y13 += MULT_NORM(r1, cPI1_8)
	STMIA   R14!,{R0-R3}    ; store y12-y15, R14 = x + 16

	SUB     R12,R14,#16*4   ; R12 = x (R14 is about to be overwritten by BL)
	BL      mdct_butterfly_16 ; x[0..15];  returns with R12 = x + 16
	BL      mdct_butterfly_16 ; x[16..31]

	LDMIA   R13!,{R4-R12,PC}

;-------------------------------------------------------------------------------
; mdct_butterfly_generic: one butterfly stage over `points` words (exported).
;
; In r0 trig (unit is 1<<31)
; In r1 x
; In r2 points
; In r3 trigint
;
; R4-R12 and R14 are saved on entry and restored on exit; R0-R3 are
; overwritten.
;
; x1 (R14) starts at x + points and x2 (R12) at x + points/2; both move
; down four words per iteration.  Each iteration stores x1 + x2 back to x1
; and the rotated difference x1 - x2 back to x2.
;
; The work is split into four loops.  R1 holds the address at which the
; current loop stops: x + 3*points/8, then x + points/4, x + points/8 and
; finally x.  The loops alternate between stepping the trig pointer up
; (loops 1 and 3) and down (loops 2 and 4) by trigint words, and differ in
; which loaded word is negated.  The comment on each LDMIA gives the
; order/sign in which the two loaded words are interpreted; that depends on
; the layout of the trig table, which is built outside this file.
;
; Rounding idiom used after every SMULL/SMLAL pair (lo, hi = 64-bit sum):
;     MOVS lo,lo,LSR #31     ; lo = bit 31 of the low word, C = bit 30
;     ADC  hi,lo,hi,LSL #1   ; hi = (sum >> 31) + bit 30, i.e. rounded

mdct_butterfly_generic
	STMDB   R13!,{R4-R12,R14}
	ADD     R14,R1,R2,LSL #2  ; x1 = x + points
	MOV     R2,R2,LSR #1      ; R2 = points/2
	ADD     R12,R1,R2,LSL #2  ; x2 = x + points/2

	ADD     R1,R1,R2,LSL #1   ; R1 = x + 3*points/8 words:
	ADD     R1,R1,R2          ; end address for loop 1
mdct_butterfly_generic_Loop1
	LDMDB   R14,{R4-R7}       ; x1[0..3] (the four words below R14)
	LDMDB   R12,{R8-R11}      ; x2[0..3] (the four words below R12)
	ADD     R4,R4,R8          ; x1[0] += x2[0]
	ADD     R5,R5,R9          ; x1[1] += x2[1]
	ADD     R6,R6,R10         ; x1[2] += x2[2]
	ADD     R7,R7,R11         ; x1[3] += x2[3]
	SUB     R8,R4,R8,ASL #1   ; r0 = x1[0] - x2[0]
	SUB     R9,R5,R9,ASL #1   ; r1 = x1[1] - x2[1]
	SUB     R10,R6,R10,ASL #1 ; r2 = x1[2] - x2[2]
	SUB     R11,R7,R11,ASL #1 ; r3 = x1[3] - x2[3]
	STMDB   R14!,{R4-R7}      ; store the sums, x1 -= 4

	LDMIA   R0,{R4-R5}        ; T[0], T[1]
	ADD     R0,R0,R3,LSL #2   ; T += trigint
	SMULL   R7,R6,R10,R4      ; x2[2] = MULT_NORM(r2, T[0])
	SMLAL   R7,R6,R11,R5      ;       + MULT_NORM(r3, T[1])
	MOVS    R7,R7,LSR #31
	ADC     R6,R7,R6,LSL #1
	SMULL   R4,R7,R11,R4      ; x2[3] = MULT_NORM(r3, T[0])
	RSB     R5,R5,#0
	SMLAL   R4,R7,R10,R5      ;       - MULT_NORM(r2, T[1])
	MOVS    R4,R4,LSR #31
	ADC     R7,R4,R7,LSL #1

	LDMIA   R0,{R10-R11}      ; T[0], T[1]
	ADD     R0,R0,R3,LSL #2   ; T += trigint
	SMULL   R5,R4,R8,R10      ; x2[0] = MULT_NORM(r0, T[0])
	SMLAL   R5,R4,R9,R11      ;       + MULT_NORM(r1, T[1])
	MOVS    R5,R5,LSR #31
	ADC     R4,R5,R4,LSL #1
	SMULL   R10,R5,R9,R10     ; x2[1] = MULT_NORM(r1, T[0])
	RSB     R11,R11,#0
	SMLAL   R10,R5,R8,R11     ;       - MULT_NORM(r0, T[1])
	MOVS    R10,R10,LSR #31
	ADC     R5,R10,R5,LSL #1

	STMDB   R12!,{R4-R7}      ; store the rotated differences, x2 -= 4

	CMP     R12,R1
	BHI     mdct_butterfly_generic_Loop1

	SUB     R1,R1,R2          ; end address for loop 2: x + points/4
mdct_butterfly_generic_Loop2
	LDMDB   R14,{R4-R7}
	LDMDB   R12,{R8-R11}
	ADD     R4,R4,R8          ; x1[0] += x2[0]
	ADD     R5,R5,R9          ; x1[1] += x2[1]
	ADD     R6,R6,R10         ; x1[2] += x2[2]
	ADD     R7,R7,R11         ; x1[3] += x2[3]
	SUB     R8,R4,R8,ASL #1   ; r0 = x1[0] - x2[0]
	SUB     R9,R5,R9,ASL #1   ; r1 = x1[1] - x2[1]
	SUB     R10,R6,R10,ASL #1 ; r2 = x1[2] - x2[2]
	SUB     R11,R7,R11,ASL #1 ; r3 = x1[3] - x2[3]
	STMDB   R14!,{R4-R7}

	LDMIA   R0,{R4-R5}        ; -T[1],-T[0]
	SUB     R0,R0,R3,LSL #2   ; T -= trigint
	RSB     R5,R5,#0
	SMULL   R6,R7,R11,R5      ; x2[3] = MULT_NORM(r3, T[0])
	SMLAL   R6,R7,R10,R4      ;       - MULT_NORM(r2, T[1])
	MOVS    R6,R6,LSR #31
	ADC     R7,R6,R7,LSL #1
	SMULL   R5,R6,R10,R5      ; x2[2] = MULT_NORM(r2, T[0])
	RSB     R4,R4,#0
	SMLAL   R5,R6,R11,R4      ;       + MULT_NORM(r3, T[1])
	MOVS    R5,R5,LSR #31
	ADC     R6,R5,R6,LSL #1

	LDMIA   R0,{R10-R11}      ; -T[1],-T[0]
	SUB     R0,R0,R3,LSL #2   ; T -= trigint
	RSB     R11,R11,#0
	SMULL   R4,R5,R9,R11      ; x2[1] = MULT_NORM(r1, T[0])
	SMLAL   R4,R5,R8,R10      ;       - MULT_NORM(r0, T[1])
	MOVS    R4,R4,LSR #31
	ADC     R5,R4,R5,LSL #1
	SMULL   R11,R4,R8,R11     ; x2[0] = MULT_NORM(r0, T[0])
	RSB     R10,R10,#0
	SMLAL   R11,R4,R9,R10     ;       + MULT_NORM(r1, T[1])
	MOVS    R11,R11,LSR #31
	ADC     R4,R11,R4,LSL #1

	STMDB   R12!,{R4-R7}

	CMP     R12,R1
	BHI     mdct_butterfly_generic_Loop2

	SUB     R1,R1,R2          ; end address for loop 3: x + points/8
mdct_butterfly_generic_Loop3
	LDMDB   R14,{R4-R7}
	LDMDB   R12,{R8-R11}
	ADD     R4,R4,R8          ; x1[0] += x2[0]
	ADD     R5,R5,R9          ; x1[1] += x2[1]
	ADD     R6,R6,R10         ; x1[2] += x2[2]
	ADD     R7,R7,R11         ; x1[3] += x2[3]
	SUB     R8,R4,R8,ASL #1   ; r0 = x1[0] - x2[0]
	SUB     R9,R5,R9,ASL #1   ; r1 = x1[1] - x2[1]
	SUB     R10,R6,R10,ASL #1 ; r2 = x1[2] - x2[2]
	SUB     R11,R7,R11,ASL #1 ; r3 = x1[3] - x2[3]
	STMDB   R14!,{R4-R7}

	LDMIA   R0,{R4-R5}        ; -T[1],T[0]
	ADD     R0,R0,R3,LSL #2   ; T += trigint
	SMULL   R6,R7,R11,R5      ; x2[3] = MULT_NORM(r3, T[0])
	SMLAL   R6,R7,R10,R4      ;       - MULT_NORM(r2, T[1])
	MOVS    R6,R6,LSR #31
	ADC     R7,R6,R7,LSL #1
	SMULL   R5,R6,R10,R5      ; x2[2] = MULT_NORM(r2, T[0])
	RSB     R4,R4,#0
	SMLAL   R5,R6,R11,R4      ;       + MULT_NORM(r3, T[1])
	MOVS    R5,R5,LSR #31
	ADC     R6,R5,R6,LSL #1

	LDMIA   R0,{R10-R11}      ; -T[1],T[0]
	ADD     R0,R0,R3,LSL #2   ; T += trigint
	SMULL   R4,R5,R9,R11      ; x2[1] = MULT_NORM(r1, T[0])
	SMLAL   R4,R5,R8,R10      ;       - MULT_NORM(r0, T[1])
	MOVS    R4,R4,LSR #31
	ADC     R5,R4,R5,LSL #1
	SMULL   R11,R4,R8,R11     ; x2[0] = MULT_NORM(r0, T[0])
	RSB     R10,R10,#0
	SMLAL   R11,R4,R9,R10     ;       + MULT_NORM(r1, T[1])
	MOVS    R11,R11,LSR #31
	ADC     R4,R11,R4,LSL #1

	STMDB   R12!,{R4-R7}

	CMP     R12,R1
	BHI     mdct_butterfly_generic_Loop3

	SUB     R1,R1,R2          ; end address for loop 4: x
mdct_butterfly_generic_Loop4
	LDMDB   R14,{R4-R7}
	LDMDB   R12,{R8-R11}
	ADD     R4,R4,R8          ; x1[0] += x2[0]
	ADD     R5,R5,R9          ; x1[1] += x2[1]
	ADD     R6,R6,R10         ; x1[2] += x2[2]
	ADD     R7,R7,R11         ; x1[3] += x2[3]
	SUB     R8,R4,R8,ASL #1   ; r0 = x1[0] - x2[0]
	SUB     R9,R5,R9,ASL #1   ; r1 = x1[1] - x2[1]
	SUB     R10,R6,R10,ASL #1 ; r2 = x1[2] - x2[2]
	SUB     R11,R7,R11,ASL #1 ; r3 = x1[3] - x2[3]
	STMDB   R14!,{R4-R7}

	LDMIA   R0,{R4-R5}        ; -T[0],T[1]
	SUB     R0,R0,R3,LSL #2   ; T -= trigint
	RSB     R4,R4,#0
	SMULL   R7,R6,R10,R4      ; x2[2] = MULT_NORM(r2, T[0])
	SMLAL   R7,R6,R11,R5      ;       + MULT_NORM(r3, T[1])
	MOVS    R7,R7,LSR #31
	ADC     R6,R7,R6,LSL #1
	SMULL   R4,R7,R11,R4      ; x2[3] = MULT_NORM(r3, T[0])
	RSB     R5,R5,#0
	SMLAL   R4,R7,R10,R5      ;       - MULT_NORM(r2, T[1])
	MOVS    R4,R4,LSR #31
	ADC     R7,R4,R7,LSL #1

	LDMIA   R0,{R10-R11}      ; -T[0],T[1]
	SUB     R0,R0,R3,LSL #2   ; T -= trigint
	RSB     R10,R10,#0
	SMULL   R5,R4,R8,R10      ; x2[0] = MULT_NORM(r0, T[0])
	SMLAL   R5,R4,R9,R11      ;       + MULT_NORM(r1, T[1])
	MOVS    R5,R5,LSR #31
	ADC     R4,R5,R4,LSL #1
	SMULL   R10,R5,R9,R10     ; x2[1] = MULT_NORM(r1, T[0])
	RSB     R11,R11,#0
	SMLAL   R10,R5,R8,R11     ;       - MULT_NORM(r0, T[1])
	MOVS    R10,R10,LSR #31
	ADC     R5,R10,R5,LSL #1

	STMDB   R12!,{R4-R7}

	CMP     R12,R1
	BHI     mdct_butterfly_generic_Loop4

	LDMIA   R13!,{R4-R12,PC}

;-------------------------------------------------------------------------------
; mdct_b0: rotation pass from `in` to `out` (exported).
;
; In R0 out
; In R1 in
; In R2 n (n multiple of 4)
; In R3 trig (unit is 1<<31)
;
; NOTE(review): every loop iteration consumes 8 input words and each loop
; covers n/4 words, so n is presumably a multiple of 32 - confirm against
; the callers.
;
; R0 and R3-R12 are restored on exit; R1 is overwritten.  R0 is pushed at
; the lowest stack address so that `out` can be re-read from [R13,#0]
; half-way through without popping.
;
; Loops 0 and 1 read the odd-offset words iX[-1], iX[-3], ... going down
; from in + n/2 and store downwards from out + 3n/4.  Loops 2 and 3 read the
; even-offset words iX[-2], iX[-4], ... going down from in + n/2 and store
; upwards from out + 3n/4.  Within each pair the first loop reads the trig
; table forwards (LDMIA) and the second reads it backwards (LDMDB).  The
; comment on each table load gives the order/sign in which the four loaded
; words are interpreted.
;
; MOVS lo,lo,LSR #31 / ADC hi,lo,hi,LSL #1 turns the 64-bit SMULL/SMLAL sum
; into (sum >> 31) rounded with bit 30.
mdct_b0
	STMDB   R13!,{R0,R3-R12,R14}

	ADD     R4,R1,R2,LSL #1 ; iX = in  + n/2
	SUB     R4,R4,#1*4      ;    - 1
	ADD     R0,R0,R2,LSL #1 ; oX = out + n/2 + n/4
	ADD     R0,R0,R2

	ADD     R1,R1,R2        ; loop 0 stops below in + n/4
mdct_b0_loop0
	LDMIA   R3!,{R5-R8}     ; -T[1],[T0],-T[3],T[2]
	LDR     R9 ,[R4],#-8    ; iX[-1]
	LDR     R10,[R4],#-8    ; iX[-3]
	RSB     R6,R6,#0
	SMULL   R14,R11,R9 ,R5  ; R11 = oX[-2] = MULT_NORM (iX[-1], -T[1])
	SMLAL   R14,R11,R10,R6  ;              - MULT_NORM (iX[-3], T[0])
	MOVS    R14,R14,LSR #31
	ADC     R11,R14,R11,LSL #1
	RSB     R5,R5,#0
	SMULL   R14,R12,R10,R5  ; R12 = oX[-1] = MULT_NORM (iX[-3], T[1])
	SMLAL   R14,R12,R9 ,R6  ;              - MULT_NORM (iX[-1], T[0])
	MOVS    R14,R14,LSR #31
	ADC     R12,R14,R12,LSL #1
	LDR     R5,[R4],#-8     ; iX[-5]
	LDR     R6,[R4],#-8     ; iX[-7]
	RSB     R8,R8,#0
	SMULL   R14,R9 ,R5,R7   ; R9  = oX[-4] = MULT_NORM (iX[-5], -T[3])
	SMLAL   R14,R9 ,R6,R8   ;              - MULT_NORM (iX[-7], T[2])
	MOVS    R14,R14,LSR #31
	ADC     R9 ,R14,R9 ,LSL #1
	RSB     R7,R7,#0
	SMULL   R14,R10,R6,R7   ; R10 = oX[-3] = MULT_NORM (iX[-7], T[3])
	SMLAL   R14,R10,R5,R8   ;              - MULT_NORM (iX[-5], T[2])
	MOVS    R14,R14,LSR #31
	ADC     R10,R14,R10,LSL #1
	STMDB   R0!,{R9-R12}    ; oX -= 4; store oX[0..3]

	CMP     R4,R1
	BHS     mdct_b0_loop0

	ADD     R3,R3,#8        ; step T forward two words before reading backwards
	SUB     R1,R1,R2        ; loop 1 stops below in
mdct_b0_loop1
	LDMDB   R3!,{R5-R8}     ; -T[2],[T3],-T[0],T[1]
	LDR     R9 ,[R4],#-8    ; iX[-1]
	LDR     R10,[R4],#-8    ; iX[-3]
	SMULL   R14,R12,R10,R8  ; R12 = oX[-1] = MULT_NORM (iX[-3], T[1])
	SMLAL   R14,R12,R9 ,R7  ;              - MULT_NORM (iX[-1], T[0])
	MOVS    R14,R14,LSR #31
	ADC     R12,R14,R12,LSL #1
	RSB     R8,R8,#0
	SMULL   R14,R11,R9 ,R8  ; R11 = oX[-2] = MULT_NORM (iX[-1], -T[1])
	SMLAL   R14,R11,R10,R7  ;              - MULT_NORM (iX[-3], T[0])
	MOVS    R14,R14,LSR #31
	ADC     R11,R14,R11,LSL #1
	LDR     R7,[R4],#-8     ; iX[-5]
	LDR     R8,[R4],#-8     ; iX[-7]
	SMULL   R14,R10,R8,R6   ; R10 = oX[-3] = MULT_NORM (iX[-7], T[3])
	SMLAL   R14,R10,R7,R5   ;              - MULT_NORM (iX[-5], T[2])
	MOVS    R14,R14,LSR #31
	ADC     R10,R14,R10,LSL #1
	RSB     R6,R6,#0
	SMULL   R14,R9 ,R7,R6   ; R9  = oX[-4] = MULT_NORM (iX[-5], -T[3])
	SMLAL   R14,R9 ,R8,R5   ;              - MULT_NORM (iX[-7], T[2])
	MOVS    R14,R14,LSR #31
	ADC     R9 ,R14,R9 ,LSL #1
	STMDB   R0!,{R9-R12}

	CMP     R4,R1
	BHS     mdct_b0_loop1

	LDR     R0,[R13,#0]     ; re-read the saved `out` without popping it
	ADD     R4,R1,R2,LSL #1 ; iX = in  + n/2
	SUB     R4,R4,#2*4      ;    - 2
	ADD     R0,R0,R2,LSL #1 ; oX = out + n/2 + n/4
	ADD     R0,R0,R2

	ADD     R1,R1,R2        ; loop 2 stops below in + n/4
mdct_b0_loop2
	LDMIA   R3!,{R5-R8}     ; -T[1],-T[0],-T[3],-T[2]
	LDR     R11,[R4],#-8    ; iX[-2]
	LDR     R12,[R4],#-8    ; iX[-4]
	RSB     R8,R8,#0
	SMULL   R14,R10,R12,R8  ; R10 = oX[1] = MULT_NORM (iX[-4], T[2])
	SMLAL   R14,R10,R11,R7  ;             - MULT_NORM (iX[-2], T[3])
	MOVS    R14,R14,LSR #31
	ADC     R10,R14,R10,LSL #1
	RSB     R7,R7,#0
	SMULL   R14,R9 ,R11,R8  ; R9  = oX[0] = MULT_NORM (iX[-2], T[2])
	SMLAL   R14,R9 ,R12,R7  ;             + MULT_NORM (iX[-4], T[3])
	MOVS    R14,R14,LSR #31
	ADC     R9 ,R14,R9 ,LSL #1
	LDR     R7,[R4],#-8     ; iX[-6]
	LDR     R8,[R4],#-8     ; iX[-8]
	RSB     R6,R6,#0
	SMULL   R14,R12,R8,R6   ; R12 = oX[3] = MULT_NORM (iX[-8], T[0])
	SMLAL   R14,R12,R7,R5   ;             - MULT_NORM (iX[-6], T[1])
	MOVS    R14,R14,LSR #31
	ADC     R12,R14,R12,LSL #1
	RSB     R5,R5,#0
	SMULL   R14,R11,R7,R6   ; R11 = oX[2] = MULT_NORM (iX[-6], T[0])
	SMLAL   R14,R11,R8,R5   ;             + MULT_NORM (iX[-8], T[1])
	MOVS    R14,R14,LSR #31
	ADC     R11,R14,R11,LSL #1
	STMIA   R0!,{R9-R12}    ; store oX[0..3]; oX += 4

	CMP     R4,R1
	BHS     mdct_b0_loop2

	SUB     R3,R3,#8        ; step T back two words before reading backwards
	SUB     R1,R1,R2        ; loop 3 stops below in
mdct_b0_loop3
	LDMDB   R3!,{R5-R8}     ; T[2],T[3],T[0],T[1]
	LDR     R11,[R4],#-8    ; iX[-2]
	LDR     R12,[R4],#-8    ; iX[-4]
	SMULL   R14,R9 ,R11,R5  ; R9  = oX[0] = MULT_NORM (iX[-2], T[2])
	SMLAL   R14,R9 ,R12,R6  ;             + MULT_NORM (iX[-4], T[3])
	MOVS    R14,R14,LSR #31
	ADC     R9 ,R14,R9 ,LSL #1
	RSB     R6,R6,#0
	SMULL   R14,R10,R12,R5  ; R10 = oX[1] = MULT_NORM (iX[-4], T[2])
	SMLAL   R14,R10,R11,R6  ;             - MULT_NORM (iX[-2], T[3])
	MOVS    R14,R14,LSR #31
	ADC     R10,R14,R10,LSL #1
	LDR     R5,[R4],#-8     ; iX[-6]
	LDR     R6,[R4],#-8     ; iX[-8]
	SMULL   R14,R11,R5,R7   ; R11 = oX[2] = MULT_NORM (iX[-6], T[0])
	SMLAL   R14,R11,R6,R8   ;             + MULT_NORM (iX[-8], T[1])
	MOVS    R14,R14,LSR #31
	ADC     R11,R14,R11,LSL #1
	RSB     R8,R8,#0
	SMULL   R14,R12,R6,R7   ; R12 = oX[3] = MULT_NORM (iX[-8], T[0])
	SMLAL   R14,R12,R5,R8   ;             - MULT_NORM (iX[-6], T[1])
	MOVS    R14,R14,LSR #31
	ADC     R12,R14,R12,LSL #1
	STMIA   R0!,{R9-R12}

	CMP     R4,R1
	BHS     mdct_b0_loop3

	LDMIA   R13!,{R0,R3-R12,PC}

;-------------------------------------------------------------------------------
; mdct_b1: second rotation pass plus mirroring, in place on `out` (exported).
;
; In R0 out
; In R1 n2
; In R2 n4
; In R3 trig2 (unit is 1<<31)
;
; R4-R12 and R14 are saved on entry and restored on exit; R3 is advanced.
;
; Pass 1 rotates pairs read upwards from `out` and writes one word below
;        oX1 and one word at oX2, both starting at out + n2 + n4.
; Pass 2 copies the words below out + n2 + n4 to below out + n4, and writes
;        them negated and in reverse order upwards from out + n4.
; Pass 3 writes the words from out + n2 + n4 upwards, in reverse order,
;        downwards from out + n2 + n4 until out + n2 is reached.
mdct_b1
	STMDB   R13!,{R4-R12,R14}

	ADD     R5,R1,R2        ; n2 + n4
	ADD     R5,R0,R5,LSL #2 ; oX1 = out + n2 + n4
	MOV     R6,R5           ; oX2 = out + n2 + n4
	MOV     R4,R0           ; iX  = out

mdct_b1_rotate
	LDMIA   R3!,{R9,R10}    ; T2[0], T2[1]
	LDMIA   R4!,{R7,R8}     ; iX[0], iX[1]
	RSB     R9,R9,#0        ; -T2[0]
	SMULL   R11,R14,R7,R10  ; oX1[-1] = MULT_NORM(iX[0], T2[1])
	SMLAL   R11,R14,R8,R9   ;         - MULT_NORM(iX[1], T2[0])
	RSB     R10,R10,#0      ; -T2[1]
	MOVS    R11,R11,LSR #31 ; R11 = bit 31 of the low word, C = bit 30
	ADC     R14,R11,R14,LSL #1 ; (sum >> 31), rounded with bit 30
	STR     R14,[R5,#-4]!
	SMULL   R11,R12,R7,R9   ; oX2[0] =- MULT_NORM(iX[0], T2[0])
	SMLAL   R11,R12,R8,R10  ;         - MULT_NORM(iX[1], T2[1])
	MOVS    R11,R11,LSR #31
	ADC     R12,R11,R12,LSL #1
	STR     R12,[R6],#4

	CMP     R4,R5           ; until iX reaches oX1
	BLO     mdct_b1_rotate

	ADD     R5,R0,R2,LSL #2 ; oX1 = out + n4
	MOV     R6,R5           ; oX2 = out + n4
	ADD     R4,R5,R1,LSL #2 ; iX  = out + n2 + n4

mdct_b1_mirror
	LDMDB   R4!,{R7-R10}    ; iX -= 4; load iX[0..3]
	STMDB   R5!,{R7-R10}    ; oX1 -= 4; oX1[0..3] = iX[0..3]
	RSB     R11,R9,#0       ; -iX[2]
	RSB     R12,R8,#0       ; -iX[1]
	RSB     R14,R7,#0       ; -iX[0]
	RSB     R10,R10,#0      ; -iX[3]
	STMIA   R6!,{R10-R12,R14} ; oX2[0..3] = -iX[3],-iX[2],-iX[1],-iX[0]

	CMP     R6,R4           ; until oX2 reaches iX
	BLO     mdct_b1_mirror

	ADD     R6,R0,R1,LSL#2  ; oX2 = out + n2
	ADD     R4,R6,R2,LSL#2  ; iX  = out + n2 + n4
	MOV     R5,R4           ; oX1 = out + n2 + n4

mdct_b1_reverse
	LDMIA   R4!,{R7-R10}    ; load iX[0..3]; iX += 4
	MOV     R11,R9          ; reorder so the store below writes
	MOV     R12,R8          ; iX[3],iX[2],iX[1],iX[0]
	MOV     R14,R7
	STMDB   R5!,{R10-R12,R14} ; oX1 -= 4; oX1[0..3] = reversed iX[0..3]

	CMP     R5,R6           ; until oX1 comes down to oX2
	BHI     mdct_b1_reverse

	LDMIA   R13!,{R4-R12,PC}

	ALIGN
; 4-bit bit-reversal table: entry i holds the value of i with its four bits
; in reverse order.  mdct_bit indexes it three times to reverse 12 bits.
bitrev
	DCB     0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15

;-------------------------------------------------------------------------------
; mdct_bit: bit-reversed reordering pass, in place on x (exported).
;
; In R0 x
; In R1 trig3 (unit is 1<<31)
; In R2 log2n
; In R3 n
;
; R4-R12 and R14 are saved on entry and restored on exit.  R2 and R3 are
; pushed as well, only to reserve two stack words that are reused as locals:
;     [R13,#0] = shift = 13 - log2n
;     [R13,#4] = pos, the running index that gets bit-reversed
;
; Each iteration reverses the low 12 bits of pos with three lookups in the
; 4-bit table `bitrev`, forms
;     acc = rev >> shift
;     inv = ((~rev & &FFF) >> shift) - 1
; and reads x0 = (x + n/2) + inv and x1 = (x + n/2) + acc.  It then writes
; two words at w0 (moving up from x) and two words below w1 (moving down
; from x + n/2), until w0 meets w1.
;
; NOTE(review): the 12-bit reversal assumes pos stays below 4096 and that
; log2n <= 13 so the shift count is not negative - confirm against callers.
mdct_bit
	STMDB   R13!,{R2-R3,R4-R12,R14}
	MOV     R6,R0           ; w0 = x
	ADD     R7,R6,R3,LSL #1 ; w1 = x + n/2
	MOV     R0,R7           ; x = w1
	RSB     R2,R2,#13       ; bitrev shift
	STR     R2,[R13,#0]
	MOV     R3,#0           ; bitrev pos
	STR     R3,[R13,#4]

mdct_bit_loop
	LDR     R2,[R13,#0]     ; reload saved shift
	LDR     R3,[R13,#4]     ; reload saved pos
	ADR     R8,bitrev
	LDRB    R9,[R8,R3,LSR #8]   ; reversed top nibble    -> bits 0-3
	AND     R10,R3,#&0F0
	LDRB    R10,[R8,R10,LSR #4] ; reversed middle nibble -> bits 4-7
	AND     R11,R3,#&00F
	LDRB    R11,[R8,R11]        ; reversed low nibble    -> bits 8-11
	ORR     R9,R9,R10,LSL #4
	ORR     R9,R9,R11,LSL #8    ; R9 = 12-bit reversal of pos
	; 12-bit complement of R9 in two instructions: complementing R9 << 20
	; puts ~R9[11:0] in the top 12 bits, and the logical shift right
	; brings them back down with zeroes above.
	MVN     R10,R9,LSL #20
	MOV     R10,R10,LSR #20     ; R10 = ~rev & &FFF
	MOV     R5,R9,LSR R2        ; acc
	MOV     R4,R10,LSR R2
	SUB     R4,R4,#1            ; inv = ((~rev & &FFF) >> shift) - 1
	ADD     R5,R0,R5,LSL #2     ; x1 = x + acc
	ADD     R4,R0,R4,LSL #2     ; x0 = x + inv
	LDMIA   R4,{R8-R9}          ; x0[0], x0[1]
	LDMIA   R5,{R10-R11}        ; x1[0], x1[1]
	ADD     R3,R3,#1
	STR     R3,[R13,#4]     ; store updated pos
	SUB     R4,R9,R11       ; r0 = x0[1] - x1[1]
	ADD     R5,R8,R10       ; r1 = x0[0] + x1[0]
	LDMIA   R1!,{R2-R3}     ; T3[0], T3[1]
	SMULL   R14,R12,R4,R3   ; r2 = MULT_NORM(r0, T3[1])
	SMLAL   R14,R12,R5,R2   ;    + MULT_NORM(r1, T3[0])
	MOVS    R14,R14,LSR #31 ; R14 = bit 31 of the low word, C = bit 30
	ADC     R12,R14,R12,LSL #1 ; (sum >> 31), rounded with bit 30
	RSB     R2,R2,#0
	SMULL   R14,R3,R5,R3    ; r3 = MULT_NORM(r1, T3[1])
	SMLAL   R14,R3,R4,R2    ;    - MULT_NORM(r0, T3[0])
	MOVS    R14,R14,LSR #31
	ADC     R3,R14,R3,LSL #1

	ADD     R4,R9,R11       ; r0 = (x0[1] + x1[1])/2
	MOV     R4,R4,ASR #1
	SUB     R5,R8,R10       ; r1 = (x0[0] - x1[0])/2
	MOV     R5,R5,ASR #1

	ADD     R8,R4,R12       ; w0[0] = r0 + r2
	ADD     R9,R5,R3        ; w0[1] = r1 + r3
	STMIA   R6!,{R8-R9}

	SUB     R10,R4,R12      ; w1[-2] = r0 - r2
	SUB     R11,R3,R5       ; w1[-1] = r3 - r1
	STMDB   R7!,{R10-R11}

	CMP     R6,R7
	BLO     mdct_bit_loop

	LDMIA   R13!,{R2-R3,R4-R12,PC}

	END
