@ Tremolo library
@-----------------------------------------------------------------------
@ Copyright (C) 2002-2009, Xiph.org Foundation
@ Copyright (C) 2010, Robin Watts for Pinknoise Productions Ltd
@ All rights reserved.

@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions
@ are met:

@     * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@     * Redistributions in binary form must reproduce the above
@ copyright notice, this list of conditions and the following disclaimer
@ in the documentation and/or other materials provided with the
@ distribution.
@     * Neither the names of the Xiph.org Foundation nor Pinknoise
@ Productions Ltd nor the names of its contributors may be used to
@ endorse or promote products derived from this software without
@ specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
@ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
@ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@ ----------------------------------------------------------------------

    .text

	@ low accuracy version

	.global mdct_backwardARM
	.global mdct_shift_right
	.global mdct_unroll_prelap
	.global mdct_unroll_part2
	.global mdct_unroll_part3
	.global mdct_unroll_postlap

	.extern	sincos_lookup0
	.extern	sincos_lookup1

mdct_unroll_prelap:
	@ r0 = out
	@ r1 = post
	@ r2 = r
	@ r3 = step
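	@ A C-like sketch of what this routine does (inferred from the code
	@ below; variable names and types are illustrative, not the exact
	@ reference source):
	@
	@     while (r > post) {
	@         ogg_int32_t v = (*--r) >> 9;
	@         if (v >  32767) v =  32767;
	@         if (v < -32768) v = -32768;
	@         *out = (ogg_int16_t)v;
	@         out += step;                /* step is in int16 units */
	@     }
	@
	@ The clamp is done branchlessly below: r4 = 0xffff7fff, and when
	@ v>>15 is neither 0 nor -1 (v does not fit in 16 bits) the stored
	@ halfword is r4 EOR (sign of v), i.e. 0x7fff or 0x8000.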
	STMFD	r13!,{r4-r7,r14}
	MVN	r4, #0x8000
	MOV	r3, r3, LSL #1
	SUB	r1, r2, r1		@ r1 = r - post
	SUBS	r1, r1, #16		@ r1 = r - post - 16
	BLT	unroll_over
unroll_loop:
	LDMDB	r2!,{r5,r6,r7,r12}

	MOV	r5, r5, ASR #9		@ r5 = (*--r)>>9
	MOV	r6, r6, ASR #9		@ r6 = (*--r)>>9
	MOV	r7, r7, ASR #9		@ r7 = (*--r)>>9
	MOV	r12,r12,ASR #9		@ r12= (*--r)>>9

	MOV	r14,r12,ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r12,r4, r14,ASR #31
	STRH	r12,[r0], r3

	MOV	r14,r7, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r7, r4, r14,ASR #31
	STRH	r7, [r0], r3

	MOV	r14,r6, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r6, r4, r14,ASR #31
	STRH	r6, [r0], r3

	MOV	r14,r5, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r5, r4, r14,ASR #31
	STRH	r5, [r0], r3

	SUBS	r1, r1, #16
	BGE	unroll_loop

unroll_over:
	ADDS	r1, r1, #16
	BLE	unroll_end
unroll_loop2:
	LDR	r5,[r2,#-4]!
	@ stall
	@ stall (Xscale)
	MOV	r5, r5, ASR #9		@ r5 = (*--r)>>9
	MOV	r14,r5, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r5, r4, r14,ASR #31
	STRH	r5, [r0], r3
	SUBS	r1, r1, #4
	BGT	unroll_loop2
unroll_end:
	LDMFD	r13!,{r4-r7,PC}

mdct_unroll_postlap:
	@ r0 = out
	@ r1 = post
	@ r2 = l
	@ r3 = step
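	@ C-like sketch (inferred from the code below; names illustrative):
	@
	@     while (l < post) {
	@         ogg_int32_t v = (-*l) >> 9;
	@         *out = CLAMP16(v);          /* clamp to signed 16-bit */
	@         out += step;
	@         l   += 2;                   /* every other word of l */
	@     }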
	STMFD	r13!,{r4-r7,r14}
	MVN	r4, #0x8000
	MOV	r3, r3, LSL #1
	SUB	r1, r1, r2		@ r1 = post - l
	MOV	r1, r1, ASR #1		@ r1 = (post - l)>>1
	SUBS	r1, r1, #16		@ r1 = ((post - l)>>1) - 16
	BLT	unroll_over3
unroll_loop3:
	LDR	r12,[r2],#8
	LDR	r7, [r2],#8
	LDR	r6, [r2],#8
	LDR	r5, [r2],#8

	RSB	r12,r12,#0
	RSB	r5, r5, #0
	RSB	r6, r6, #0
	RSB	r7, r7, #0

	MOV	r12, r12,ASR #9		@ r12= (-*l)>>9
	MOV	r5,  r5, ASR #9		@ r5 = (-*l)>>9
	MOV	r6,  r6, ASR #9		@ r6 = (-*l)>>9
	MOV	r7,  r7, ASR #9		@ r7 = (-*l)>>9

	MOV	r14,r12,ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r12,r4, r14,ASR #31
	STRH	r12,[r0], r3

	MOV	r14,r7, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r7, r4, r14,ASR #31
	STRH	r7, [r0], r3

	MOV	r14,r6, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r6, r4, r14,ASR #31
	STRH	r6, [r0], r3

	MOV	r14,r5, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r5, r4, r14,ASR #31
	STRH	r5, [r0], r3

	SUBS	r1, r1, #16
	BGE	unroll_loop3

unroll_over3:
	ADDS	r1, r1, #16
	BLE	unroll_over4
unroll_loop4:
	LDR	r5,[r2], #8
	@ stall
	@ stall (Xscale)
	RSB	r5, r5, #0
	MOV	r5, r5, ASR #9		@ r5 = (-*l)>>9
	MOV	r14,r5, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r5, r4, r14,ASR #31
	STRH	r5, [r0], r3
	SUBS	r1, r1, #4
	BGT	unroll_loop4
unroll_over4:
	LDMFD	r13!,{r4-r7,PC}

mdct_unroll_part2:
	@ r0 = out
	@ r1 = post
	@ r2 = l
	@ r3 = r
	@ <> = step
	@ <> = wL
	@ <> = wR
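	@ C-like sketch (inferred from the code below; names illustrative).
	@ Overlap-add of the right half against the windows wL/wR, which are
	@ unsigned 8-bit here (low-accuracy build), so both operands are
	@ pre-shifted to Q8 before the multiply:
	@
	@     while (r > post) {
	@         l -= 2;
	@         ogg_int32_t v = ((*l   >> 8) * *wL++ +
	@                          (*--r >> 8) * *--wR) >> 9;
	@         *out = CLAMP16(v);
	@         out += step;
	@     }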
	MOV	r12,r13
	STMFD	r13!,{r4,r6-r11,r14}
	LDMFD	r12,{r8,r9,r10}		@ r8 = step
					@ r9 = wL
					@ r10= wR
	MVN	r4, #0x8000
	MOV	r8, r8, LSL #1
	SUBS	r1, r3, r1		@ r1 = (r - post)
	BLE	unroll_over5
unroll_loop5:
	LDR	r12,[r2, #-8]!		@ r12= *l       (but l -= 2 first)
	LDR	r7, [r3, #-4]!		@ r7 = *--r
	LDRB	r6, [r10,#-1]!		@ r6 = *--wR
	LDRB	r11,[r9],#1		@ r11= *wL++

	MOV	r12, r12, ASR #8
	@ Can save a cycle here, at the cost of 1-bit errors in rounding
	MUL	r11,r12,r11		@ r11  = *l   * *wL++
	MOV	r7, r7, ASR #8
	MLA	r6, r7, r6, r11		@ r6   = *--r * *--wR + r11
	MOV	r6, r6, ASR #9
	MOV	r14,r6, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r6, r4, r14,ASR #31
	STRH	r6, [r0], r8

	SUBS	r1, r1, #4
	BGT	unroll_loop5

unroll_over5:
	LDMFD	r13!,{r4,r6-r11,PC}

mdct_unroll_part3:
	@ r0 = out
	@ r1 = post
	@ r2 = l
	@ r3 = r
	@ <> = step
	@ <> = wL
	@ <> = wR
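	@ C-like sketch (inferred from the code below): the mirror of part2,
	@ running forwards and subtracting the left-window term:
	@
	@     while (r < post) {
	@         ogg_int32_t v = ((*r++ >> 8) * *--wR -
	@                          (*l   >> 8) * *wL++) >> 9;
	@         *out = CLAMP16(v);
	@         out += step;
	@         l   += 2;
	@     }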
	MOV	r12,r13
	STMFD	r13!,{r4,r6-r11,r14}
	LDMFD	r12,{r8,r9,r10}		@ r8 = step
					@ r9 = wL
					@ r10= wR
	MVN	r4, #0x8000
	MOV	r8, r8, LSL #1
	SUBS	r1, r1, r3		@ r1 = (post - r)
	BLE	unroll_over6
unroll_loop6:
	LDR	r12,[r2],#8		@ r12= *l       (l += 2 after)
	LDR	r7, [r3],#4		@ r7 = *r++
	LDRB	r11,[r9],#1		@ r11= *wL++
	LDRB	r6, [r10,#-1]!		@ r6 = *--wR

	@ Can save a cycle here, at the cost of 1-bit errors in rounding
	MOV	r12,r12,ASR #8
	MUL	r11,r12,r11		@ r11  = *l   * *wL++
	MOV	r7, r7, ASR #8
	MUL	r6, r7, r6		@ r6   = *--r * *--wR
	SUB	r6, r6, r11
	MOV	r6, r6, ASR #9
	MOV	r14,r6, ASR #15
	TEQ	r14,r14,ASR #31		@ if r14==0 || r14==-1 then in range
	EORNE	r6, r4, r14,ASR #31
	STRH	r6, [r0], r8

	SUBS	r1, r1, #4
	BGT	unroll_loop6

unroll_over6:
	LDMFD	r13!,{r4,r6-r11,PC}

mdct_shift_right:
	@ r0 = n
	@ r1 = in
	@ r2 = right
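	@ C-like sketch (inferred from the code below): copy every other
	@ word, starting from in[1], into right[] -- n>>2 words in all:
	@
	@     for (i = 0; i < (n >> 2); i++)
	@         right[i] = in[2*i + 1];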
	STMFD	r13!,{r4-r11,r14}

	MOV	r0, r0, LSR #2		@ n >>= 2
	ADD	r1, r1, #4

	SUBS	r0, r0, #8
	BLT	sr_less_than_8
sr_loop:
	LDR	r3, [r1], #8
	LDR	r4, [r1], #8
	LDR	r5, [r1], #8
	LDR	r6, [r1], #8
	LDR	r7, [r1], #8
	LDR	r8, [r1], #8
	LDR	r12,[r1], #8
	LDR	r14,[r1], #8
	SUBS	r0, r0, #8
	STMIA	r2!,{r3,r4,r5,r6,r7,r8,r12,r14}
	BGE	sr_loop
sr_less_than_8:
	ADDS	r0, r0, #8
	BEQ	sr_end
sr_loop2:
	LDR	r3, [r1], #8
	SUBS	r0, r0, #1
	STR	r3, [r2], #4
	BGT	sr_loop2
sr_end:
	LDMFD	r13!,{r4-r11,PC}

mdct_backwardARM:
	@ r0 = n
	@ r1 = in
	STMFD	r13!,{r4-r11,r14}

	MOV	r2, #1<<4	@ r2 = 1<<shift
	MOV	r3, #13-4	@ r3 = 13-shift
find_shift_loop:
	TST	r0, r2		@ if (n & (1<<shift)) == 0
	MOV	r2, r2, LSL #1
	SUBEQ	r3, r3, #1	@ shift--
	BEQ	find_shift_loop
	MOV	r2, #2
	MOV	r2, r2, LSL r3	@ r2 = step = 2<<shift
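	@ Equivalent C for the shift/step setup above (a sketch):
	@
	@     int shift = 4;
	@     while (!(n & (1 << shift))) shift++;
	@     shift = 13 - shift;              /* r3 */
	@     int step = 2 << shift;           /* r2 */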

	@ presymmetry
	@ r0 = n (a multiple of 4)
	@ r1 = in
	@ r2 = step
	@ r3 = shift

	ADD	r4, r1, r0, LSL #1	@ r4 = aX = in+(n>>1)
	ADD	r14,r1, r0		@ r14= in+(n>>2)
	SUB	r4, r4, #3*4		@ r4 = aX = in+n2-3
	LDR	r5, =sincos_lookup0	@ r5 = T=sincos_lookup0

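	@ The XPROD31/XNPROD31 annotations used from here on refer to the
	@ usual fixed-point helpers; conceptually (full-precision form):
	@
	@     XPROD31 (a, b, t, v, x, y):  *x = (a*t + b*v) >> 31;
	@                                  *y = (b*t - a*v) >> 31;
	@     XNPROD31(a, b, t, v, x, y):  *x = (a*t - b*v) >> 31;
	@                                  *y = (b*t + a*v) >> 31;
	@
	@ In this low-accuracy build the operands are pre-shifted (ASR #8)
	@ and the table entries are bytes, so no explicit >>31 appears.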
presymmetry_loop1:
	LDR	r7, [r4,#8]		@ r7 = s2 = aX[2]
	LDRB	r11,[r5,#1]		@ r11= T[1]
	LDR	r6, [r4],#-16		@ r6 = s0 = aX[0]
	LDRB	r10,[r5],r2		@ r10= T[0]   T += step
	MOV	r6, r6, ASR #8
	MOV	r7, r7, ASR #8

	@ XPROD31(s0, s2, T[0], T[1], &aX[0], &aX[2])
	MUL	r9, r6, r10		@ r9   = s0*T[0]
	RSB	r6, r6, #0
	MLA	r9, r7, r11,r9		@ r9  += s2*T[1]
	CMP	r4, r14
	MUL	r12,r7, r10		@ r12  = s2*T[0]
	STR	r9, [r4,#16]		@ aX[0] = r9
	MLA	r12,r6, r11,r12		@ r12 -= s0*T[1]
	STR	r12,[r4,#8+16]		@ aX[2] = r12

	BGE	presymmetry_loop1	@ while (aX >= in+n4)

presymmetry_loop2:
	LDR	r6, [r4],#-16		@ r6 = s0 = aX[0]
	LDRB	r10,[r5,#1]		@ r10= T[1]
	LDR	r7, [r4,#16+8]		@ r7 = s2 = aX[2]
	LDRB	r11,[r5],-r2		@ r11= T[0]   T -= step
	MOV	r6, r6, ASR #8
	MOV	r7, r7, ASR #8

	@ XPROD31(s0, s2, T[1], T[0], &aX[0], &aX[2])
	MUL	r9, r6, r10		@ r9   = s0*T[1]
	RSB	r6, r6, #0
	MLA	r9, r7, r11,r9		@ r9  += s2*T[0]
	CMP	r4, r1
	MUL	r12,r7, r10		@ r12  = s2*T[1]
	STR	r9, [r4,#16]		@ aX[0] = r9
	MLA	r12,r6, r11,r12		@ r12 -= s0*T[0]
	STR	r12,[r4,#8+16]		@ aX[2] = r12

	BGE	presymmetry_loop2	@ while (aX >= in)

	@ r0 = n
	@ r1 = in
	@ r2 = step
	@ r3 = shift
	STMFD	r13!,{r3}
	LDR	r5, =sincos_lookup0	@ r5 = T=sincos_lookup0
	ADD	r4, r1, r0, LSL #1	@ r4 = aX = in+(n>>1)
	SUB	r4, r4, #4*4		@ r4 = aX = in+(n>>1)-4
	LDRB	r11,[r5,#1]		@ r11= T[1]
	LDRB	r10,[r5],r2		@ r10= T[0]    T += step
presymmetry_loop3:
	LDR	r8, [r1],#16 		@ r8 = ro0 = bX[0]
	LDR	r9, [r1,#8-16]		@ r9 = ro2 = bX[2]
	LDR	r6, [r4],#-16		@ r6 = ri0 = aX[0]
	LDR	r7, [r4,#8+16]		@ r7 = ri2 = aX[2]
	MOV	r8, r8, ASR #8
	MOV	r9, r9, ASR #8
	MOV	r6, r6, ASR #8

	@ XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] )
	@ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31
	MUL	r12,r8, r11		@ r12  = ro0*T[1]
	MOV	r7, r7, ASR #8
	MLA	r12,r9, r10,r12		@ r12 += ro2*T[0]
	RSB	r8, r8, #0		@ r8 = -ro0
	MUL	r3, r9, r11		@ r3   = ro2*T[1]
	LDRB	r11,[r5,#1]		@ r11= T[1]
	MLA	r3, r8, r10,r3		@ r3  -= ro0*T[0]
	LDRB	r10,[r5],r2		@ r10= T[0]    T += step
	STR	r12,[r4,#16+8]
	STR	r3, [r4,#16]

	@ XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] )
	@ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31
	MUL	r12,r6, r10		@ r12  = ri0*T[0]
	RSB	r6, r6, #0		@ r6 = -ri0
	MLA	r12,r7, r11,r12		@ r12 += ri2*T[1]
	CMP	r4, r1
	MUL	r3, r7, r10		@ r3   = ri2*T[0]
	STR	r12,[r1,#8-16]
	MLA	r3, r6, r11,r3		@ r3  -= ri0*T[1]
	STR	r3, [r1,#-16]

	BGE	presymmetry_loop3

	SUB	r1,r1,r0		@ r1 = in -= n>>2 (i.e. restore in)

	LDR	r3,[r13]
	STR	r2,[r13,#-4]!

	@ mdct_butterflies
	@ r0 = n  = (points * 2)
	@ r1 = in = x
	@ r2 = i
	@ r3 = shift
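	@ Loop structure in C-like form (a sketch mirroring the register
	@ comments below):
	@
	@     int stages = 7 - shift;
	@     for (i = 0; --stages > 0; i++)
	@         for (j = 0; j < (1 << i); j++)
	@             mdct_butterfly_generic(x + (points >> i) * j,
	@                                    points >> i, 4 << (i + shift));
	@     /* followed by 32-point butterflies over the whole array */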
	STMFD	r13!,{r0-r1}
	RSBS	r4,r3,#6		@ r4 = stages = 7-shift then --stages
	LDR	r5,=sincos_lookup0
	BLE	no_generics
	MOV	r14,#4			@ r14= 4               (i=0)
	MOV	r6, r14,LSL r3		@ r6 = (4<<i)<<shift
mdct_butterflies_loop1:
	MOV	r0, r0, LSR #1		@ r0 = points>>i = POINTS
	MOV	r2, r14,LSR #2		@ r2 = (1<<i)-j        (j=0)
	STMFD	r13!,{r4,r14}
mdct_butterflies_loop2:

	@ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift))
	@ mdct_butterfly_generic(r1, r0, r6)
	@ r0 = points
	@ r1 = x
	@ preserve r2 (external loop counter)
	@ preserve r3
	@ preserve r4 (external loop counter)
	@ r5 = T = sincos_lookup0
	@ r6 = step
	@ preserve r14

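	@ Note: the first inner loop below steps T forwards through the table
	@ (until T reaches sincos_lookup0+1024, the limit kept in r12) using
	@ the XPROD31 form; the second loop then steps T back down again
	@ using the XNPROD31 form.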
	STR	r2,[r13,#-4]!		@ stack r2
	ADD	r1,r1,r0,LSL #1		@ r1 = x2+4 = x + (POINTS>>1)
	ADD	r7,r1,r0,LSL #1		@ r7 = x1+4 = x + POINTS
	ADD	r12,r5,#1024		@ r12= sincos_lookup0+1024

mdct_bufferfly_generic_loop1:
	LDMDB	r7!,{r2,r3,r8,r11}	@ r2 = x1[0]
					@ r3 = x1[1]
					@ r8 = x1[2]
					@ r11= x1[3]    x1 -= 4
	LDMDB	r1!,{r4,r9,r10,r14}	@ r4 = x2[0]
					@ r9 = x2[1]
					@ r10= x2[2]
					@ r14= x2[3]    x2 -= 4

	SUB	r2, r2, r3		@ r2 = s0 = x1[0] - x1[1]
	ADD	r3, r2, r3, LSL #1	@ r3 =      x1[0] + x1[1] (-> x1[0])
	SUB	r11,r11,r8		@ r11= s1 = x1[3] - x1[2]
	ADD	r8, r11,r8, LSL #1	@ r8 =      x1[3] + x1[2] (-> x1[2])
	SUB	r9, r9, r4		@ r9 = s2 = x2[1] - x2[0]
	ADD	r4, r9, r4, LSL #1	@ r4 =      x2[1] + x2[0] (-> x1[1])
	SUB	r14,r14,r10		@ r14= s3 = x2[3] - x2[2]
	ADD	r10,r14,r10,LSL #1	@ r10=      x2[3] + x2[2] (-> x1[3])
	STMIA	r7,{r3,r4,r8,r10}

	@ r0 = points
	@ r1 = x2
	@ r2 = s0
	@ r3 free
	@ r4 free
	@ r5 = T
	@ r6 = step
	@ r7 = x1
	@ r8 free
	@ r9 = s2
	@ r10 free
	@ r11= s1
	@ r12= limit
	@ r14= s3

	LDRB	r8, [r5,#1]		@ r8 = T[1]
	LDRB	r10,[r5],r6		@ r10= T[0]		T += step
	MOV	r2, r2, ASR #8
	MOV	r11,r11,ASR #8
	MOV	r9, r9, ASR #8
	MOV	r14,r14,ASR #8

	@ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2])
	@ x2[0] = (s1*T[0] + s0*T[1])>>31     x2[2] = (s0*T[0] - s1*T[1])>>31
	@ stall Xscale
	MUL	r3, r2, r8		@ r3   = s0*T[1]
	MLA	r3, r11,r10,r3		@ r3  += s1*T[0]
	RSB	r11,r11,#0
	MUL	r4, r8, r11		@ r4   = -s1*T[1]
	MLA	r4, r2, r10,r4		@ r4  += s0*T[0] = Value for x2[2]
	MOV	r2, r3			@ r2 = r3 = Value for x2[0]

	@ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3])
	@ x2[1] = (s2*T[0] + s3*T[1])>>31     x2[3] = (s3*T[0] - s2*T[1])>>31
	MUL	r3, r9, r10		@ r3   = s2*T[0]
	MLA	r3, r14,r8, r3		@ r3  += s3*T[1] = Value for x2[1]
	RSB	r9, r9, #0
	MUL	r11,r14,r10		@ r11  = s3*T[0]
	MLA	r11,r9, r8, r11		@ r11 -= s2*T[1] = Value for x2[3]
	CMP	r5, r12

	STMIA	r1,{r2,r3,r4,r11}

	BLT	mdct_bufferfly_generic_loop1

	SUB	r12,r12,#1024
mdct_bufferfly_generic_loop2:
	LDMDB	r7!,{r2,r3,r9,r10}	@ r2 = x1[0]
					@ r3 = x1[1]
					@ r9 = x1[2]
					@ r10= x1[3]    x1 -= 4
	LDMDB	r1!,{r4,r8,r11,r14}	@ r4 = x2[0]
					@ r8 = x2[1]
					@ r11= x2[2]
					@ r14= x2[3]    x2 -= 4

	SUB	r2, r2, r3		@ r2 = s0 = x1[0] - x1[1]
	ADD	r3, r2, r3, LSL #1	@ r3 =      x1[0] + x1[1] (-> x1[0])
	SUB	r9, r9,r10		@ r9 = s1 = x1[2] - x1[3]
	ADD	r10,r9,r10, LSL #1	@ r10=      x1[2] + x1[3] (-> x1[2])
	SUB	r4, r4, r8		@ r4 = s2 = x2[0] - x2[1]
	ADD	r8, r4, r8, LSL #1	@ r8 =      x2[0] + x2[1] (-> x1[1])
	SUB	r14,r14,r11		@ r14= s3 = x2[3] - x2[2]
	ADD	r11,r14,r11,LSL #1	@ r11=      x2[3] + x2[2] (-> x1[3])
	STMIA	r7,{r3,r8,r10,r11}

	@ r0 = points
	@ r1 = x2
	@ r2 = s0
	@ r3 free
	@ r4 = s2
	@ r5 = T
	@ r6 = step
	@ r7 = x1
	@ r8 free
	@ r9 = s1
	@ r10 free
	@ r11 free
	@ r12= limit
	@ r14= s3

	LDRB	r8, [r5,#1]		@ r8 = T[1]
	LDRB	r10,[r5],-r6		@ r10= T[0]		T -= step
	MOV	r2, r2, ASR #8
	MOV	r9, r9, ASR #8
	MOV	r4, r4, ASR #8
	MOV	r14,r14,ASR #8

	@ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2])
	@ x2[0] = (s0*T[0] - s1*T[1])>>31     x2[2] = (s1*T[0] + s0*T[1])>>31
	@ stall Xscale
	MUL	r11,r2, r8		@ r11  = s0*T[1]
	MLA	r11,r9, r10,r11		@ r11 += s1*T[0]
	RSB	r9, r9, #0
	MUL	r2, r10,r2		@ r2   = s0*T[0]
	MLA	r2, r9, r8, r2		@ r2  += -s1*T[1] = Value for x2[0]
	MOV	r9, r11			@ r9 = r11 = Value for x2[2]

	@ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3])
	@ x2[1] = (s3*T[0] - s2*T[1])>>31     x2[3] = (s2*T[0] + s3*T[1])>>31
	MUL	r11,r4, r10		@ r11   = s2*T[0]
	MLA	r11,r14,r8, r11		@ r11  += s3*T[1] = Value for x2[3]
	RSB	r4, r4, #0
	MUL	r3, r14,r10		@ r3   = s3*T[0]
	MLA	r3, r4, r8, r3		@ r3  -= s2*T[1] = Value for x2[1]
	CMP	r5, r12

	STMIA	r1,{r2,r3,r9,r11}

	BGT	mdct_bufferfly_generic_loop2

	LDR	r2,[r13],#4		@ unstack r2
	ADD	r1, r1, r0, LSL #2	@ r1 = x+POINTS*j
	@ stall Xscale
	SUBS	r2, r2, #1		@ r2--                 (j++)
	BGT	mdct_butterflies_loop2

	LDMFD	r13!,{r4,r14}

	LDR	r1,[r13,#4]

	SUBS	r4, r4, #1		@ stages--
	MOV	r14,r14,LSL #1		@ r14= 4<<i            (i++)
	MOV	r6, r6, LSL #1		@ r6 = step <<= 1      (i++)
	BGE	mdct_butterflies_loop1
	LDMFD	r13,{r0-r1}

no_generics:
	@ mdct_butterflies part2 (loop around mdct_butterfly_32)
	@ r0 = points
	@ r1 = in
	@ r2 = step
	@ r3 = shift

mdct_bufferflies_loop3:
	@ mdct_butterfly_32

	@ block1
	ADD	r4, r1, #16*4		@ r4 = &in[16]
	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[16]
					@ r6 = x[17]
					@ r9 = x[18]
					@ r10= x[19]
	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[0]
					@ r8 = x[1]
					@ r11= x[2]
					@ r12= x[3]
	SUB	r5, r5, r6		@ r5 = s0 = x[16] - x[17]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[16] + x[17]  -> x[16]
	SUB	r9, r9, r10		@ r9 = s1 = x[18] - x[19]
	ADD	r10,r9, r10,LSL #1	@ r10=      x[18] + x[19]  -> x[18]
	SUB	r8, r8, r7		@ r8 = s2 = x[ 1] - x[ 0]
	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 1] + x[ 0]  -> x[17]
	SUB	r12,r12,r11		@ r12= s3 = x[ 3] - x[ 2]
	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 3] + x[ 2]  -> x[19]
	STMIA	r4!,{r6,r7,r10,r11}

	MOV	r6,#0xed		@ r6 =cPI1_8
	MOV	r7,#0x62		@ r7 =cPI3_8
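	@ (The 8-bit constants are, approximately, in Q8:
	@  0xED = 237 ~ cos(PI/8)*256, 0xB5 = 181 ~ cos(PI/4)*256,
	@  0x62 =  98 ~ cos(3PI/8)*256.)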

	MOV	r5, r5, ASR #8
	MOV	r9, r9, ASR #8
	MOV	r8, r8, ASR #8
	MOV	r12,r12,ASR #8

	@ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] )
	@ x[0] = s0*cPI3_8 - s1*cPI1_8     x[2] = s1*cPI3_8 + s0*cPI1_8
	@ stall Xscale
	MUL	r11,r5, r6		@ r11  = s0*cPI1_8
	MLA	r11,r9, r7, r11		@ r11 += s1*cPI3_8
	RSB	r9, r9, #0
	MUL	r5, r7, r5		@ r5   = s0*cPI3_8
	MLA	r5, r9, r6, r5		@ r5  -= s1*cPI1_8

	@ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] )
	@ x[1] = s2*cPI1_8 + s3*cPI3_8     x[3] = s3*cPI1_8 - s2*cPI3_8
	MUL	r9, r8, r6		@ r9   = s2*cPI1_8
	MLA	r9, r12,r7, r9		@ r9  += s3*cPI3_8
	RSB	r8,r8,#0
	MUL	r12,r6, r12		@ r12  = s3*cPI1_8
	MLA	r12,r8, r7, r12		@ r12 -= s2*cPI3_8
	STMIA	r1!,{r5,r9,r11,r12}

	@ block2
	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[20]
					@ r6 = x[21]
					@ r9 = x[22]
					@ r10= x[23]
	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[4]
					@ r8 = x[5]
					@ r11= x[6]
					@ r12= x[7]
	SUB	r5, r5, r6		@ r5 = s0 = x[20] - x[21]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[20] + x[21]  -> x[20]
	SUB	r9, r9, r10		@ r9 = s1 = x[22] - x[23]
	ADD	r10,r9, r10,LSL #1	@ r10=      x[22] + x[23]  -> x[22]
	SUB	r8, r8, r7		@ r8 = s2 = x[ 5] - x[ 4]
	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 5] + x[ 4]  -> x[21]
	SUB	r12,r12,r11		@ r12= s3 = x[ 7] - x[ 6]
	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 7] + x[ 6]  -> x[23]
	MOV	r14,#0xb5		@ cPI2_8
	STMIA	r4!,{r6,r7,r10,r11}

	SUB	r5, r5, r9		@ r5 = s0 - s1
	ADD	r9, r5, r9, LSL #1	@ r9 = s0 + s1
	MOV	r5, r5, ASR #8
	MUL	r5, r14,r5		@ r5 = (s0-s1)*cPI2_8
	SUB	r12,r12,r8		@ r12= s3 - s2
	ADD	r8, r12,r8, LSL #1	@ r8 = s3 + s2

	MOV	r8, r8, ASR #8
	MUL	r8, r14,r8		@ r8  = (s3+s2)*cPI2_8
	MOV	r9, r9, ASR #8
	MUL	r9, r14,r9		@ r9  = (s0+s1)*cPI2_8
	MOV	r12,r12,ASR #8
	MUL	r12,r14,r12		@ r12 = (s3-s2)*cPI2_8
	STMIA	r1!,{r5,r8,r9,r12}

	@ block3
	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[24]
					@ r6 = x[25]
					@ r9 = x[26]
					@ r10= x[27]
	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[8]
					@ r8 = x[9]
					@ r11= x[10]
					@ r12= x[11]
	SUB	r5, r5, r6		@ r5 = s0 = x[24] - x[25]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[24] + x[25]  -> x[24]
	SUB	r9, r9, r10		@ r9 = s1 = x[26] - x[27]
	ADD	r10,r9, r10,LSL #1	@ r10=      x[26] + x[27]  -> x[26]
	SUB	r8, r8, r7		@ r8 = s2 = x[ 9] - x[ 8]
	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 9] + x[ 8]  -> x[25]
	SUB	r12,r12,r11		@ r12= s3 = x[11] - x[10]
	ADD	r11,r12,r11, LSL #1	@ r11=      x[11] + x[10]  -> x[27]
	STMIA	r4!,{r6,r7,r10,r11}

	MOV	r6,#0x62		@ r6 = cPI3_8
	MOV	r7,#0xED		@ r7 = cPI1_8

	@ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] )
	@ x[8] = s0*cPI1_8 - s1*cPI3_8     x[10] = s1*cPI1_8 + s0*cPI3_8
	@ stall Xscale
	MOV	r5, r5, ASR #8
	MUL	r11,r5, r6		@ r11  = s0*cPI3_8
	MOV	r9, r9, ASR #8
	MLA	r11,r9, r7, r11		@ r11 += s1*cPI1_8
	RSB	r9, r9, #0
	MUL	r5, r7, r5		@ r5   = s0*cPI1_8
	MLA	r5, r9, r6, r5		@ r5  -= s1*cPI3_8

	@ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] )
	@ x[9] = s2*cPI3_8 + s3*cPI1_8     x[11] = s3*cPI3_8 - s2*cPI1_8
	MOV	r8, r8, ASR #8
	MUL	r9, r8, r6		@ r9   = s2*cPI3_8
	MOV	r12,r12,ASR #8
	MLA	r9, r12,r7, r9		@ r9  += s3*cPI1_8
	RSB	r8,r8,#0
	MUL	r12,r6, r12		@ r12  = s3*cPI3_8
	MLA	r12,r8, r7, r12		@ r12 -= s2*cPI1_8
	STMIA	r1!,{r5,r9,r11,r12}

	@ block4
	LDMIA	r4,{r5,r6,r10,r11}	@ r5 = x[28]
					@ r6 = x[29]
					@ r10= x[30]
					@ r11= x[31]
	LDMIA	r1,{r8,r9,r12,r14}	@ r8 = x[12]
					@ r9 = x[13]
					@ r12= x[14]
					@ r14= x[15]
	SUB	r5, r5, r6		@ r5 = s0 = x[28] - x[29]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[28] + x[29]  -> x[28]
	SUB	r7, r14,r12		@ r7 = s3 = x[15] - x[14]
	ADD	r12,r7, r12, LSL #1	@ r12=      x[15] + x[14]  -> x[31]
	SUB	r10,r10,r11		@ r10= s1 = x[30] - x[31]
	ADD	r11,r10,r11,LSL #1	@ r11=      x[30] + x[31]  -> x[30]
	SUB	r14, r8, r9		@ r14= s2 = x[12] - x[13]
	ADD	r9, r14, r9, LSL #1	@ r9 =      x[12] + x[13]  -> x[29]
	STMIA	r4!,{r6,r9,r11,r12}
	STMIA	r1!,{r5,r7,r10,r14}

	@ mdct_butterfly16 (1st version)
	@ block 1
	SUB	r1,r1,#16*4
	ADD	r4,r1,#8*4
	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[ 8]
					@ r6 = x[ 9]
					@ r9 = x[10]
					@ r10= x[11]
	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[0]
					@ r8 = x[1]
					@ r11= x[2]
					@ r12= x[3]
	SUB	r5, r5, r6		@ r5 = s0 = x[ 8] - x[ 9]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[ 8] + x[ 9]  -> x[ 8]
	SUB	r9, r9, r10		@ r9 = s1 = x[10] - x[11]
	ADD	r10,r9, r10,LSL #1	@ r10=      x[10] + x[11]  -> x[10]
	SUB	r8, r8, r7		@ r8 = s2 = x[ 1] - x[ 0]
	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 1] + x[ 0]  -> x[ 9]
	SUB	r12,r12,r11		@ r12= s3 = x[ 3] - x[ 2]
	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 3] + x[ 2]  -> x[11]
	MOV	r14,#0xB5		@ r14= cPI2_8
	STMIA	r4!,{r6,r7,r10,r11}

	SUB	r5, r5, r9		@ r5 = s0 - s1
	ADD	r9, r5, r9, LSL #1	@ r9 = s0 + s1
	MOV	r5, r5, ASR #8
	MUL	r5, r14,r5		@ r5  = (s0-s1)*cPI2_8
	SUB	r12,r12,r8		@ r12= s3 - s2
	ADD	r8, r12,r8, LSL #1	@ r8 = s3 + s2

	MOV	r8, r8, ASR #8
	MUL	r8, r14,r8		@ r8  = (s3+s2)*cPI2_8
	MOV	r9, r9, ASR #8
	MUL	r9, r14,r9		@ r9  = (s0+s1)*cPI2_8
	MOV	r12,r12,ASR #8
	MUL	r12,r14,r12		@ r12 = (s3-s2)*cPI2_8
	STMIA	r1!,{r5,r8,r9,r12}

	@ block2
	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[12]
					@ r6 = x[13]
					@ r9 = x[14]
					@ r10= x[15]
	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[ 4]
					@ r8 = x[ 5]
					@ r11= x[ 6]
					@ r12= x[ 7]
	SUB	r14,r7, r8		@ r14= s0 = x[ 4] - x[ 5]
	ADD	r8, r14,r8, LSL #1	@ r8 =      x[ 4] + x[ 5]  -> x[13]
	SUB	r7, r12,r11		@ r7 = s1 = x[ 7] - x[ 6]
	ADD	r11,r7, r11, LSL #1	@ r11=      x[ 7] + x[ 6]  -> x[15]
	SUB	r5, r5, r6		@ r5 = s2 = x[12] - x[13]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[12] + x[13]  -> x[12]
	SUB	r12,r9, r10		@ r12= s3 = x[14] - x[15]
	ADD	r10,r12,r10,LSL #1	@ r10=      x[14] + x[15]  -> x[14]
	STMIA	r4!,{r6,r8,r10,r11}
	STMIA	r1!,{r5,r7,r12,r14}

	@ mdct_butterfly_8
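	@ C-like sketch of the 8-point butterfly performed below (taken from
	@ the register comments; x[] is the current 8-word block):
	@
	@     s0 = x[0]+x[1]; s1 = x[0]-x[1]; s2 = x[2]+x[3]; s3 = x[2]-x[3];
	@     s4 = x[4]+x[5]; s5 = x[4]-x[5]; s6 = x[6]+x[7]; s7 = x[6]-x[7];
	@     x[0] = s5+s3;  x[1] = s7-s1;  x[2] = s5-s3;  x[3] = s7+s1;
	@     x[4] = s4-s0;  x[5] = s6-s2;  x[6] = s4+s0;  x[7] = s6+s2;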
	LDMDB	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
					@ r6 = x[0]
					@ r7 = x[1]
					@ r8 = x[2]
					@ r9 = x[3]
					@ r10= x[4]
					@ r11= x[5]
					@ r12= x[6]
					@ r14= x[7]
	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]

	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
	STMDB	r1,{r2,r3,r4,r5,r10,r11,r12,r14}

	@ mdct_butterfly_8
	LDMIA	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
					@ r6 = x[0]
					@ r7 = x[1]
					@ r8 = x[2]
					@ r9 = x[3]
					@ r10= x[4]
					@ r11= x[5]
					@ r12= x[6]
					@ r14= x[7]
	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]

	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
	STMIA	r1,{r2,r3,r4,r5,r10,r11,r12,r14}

	@ mdct_butterfly16 (2nd version)
	@ block 1
	ADD	r1,r1,#16*4-8*4
	ADD	r4,r1,#8*4
	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[ 8]
					@ r6 = x[ 9]
					@ r9 = x[10]
					@ r10= x[11]
	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[0]
					@ r8 = x[1]
					@ r11= x[2]
					@ r12= x[3]
	SUB	r5, r5, r6		@ r5 = s0 = x[ 8] - x[ 9]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[ 8] + x[ 9]  -> x[ 8]
	SUB	r9, r9, r10		@ r9 = s1 = x[10] - x[11]
	ADD	r10,r9, r10,LSL #1	@ r10=      x[10] + x[11]  -> x[10]
	SUB	r8, r8, r7		@ r8 = s2 = x[ 1] - x[ 0]
	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 1] + x[ 0]  -> x[ 9]
	SUB	r12,r12,r11		@ r12= s3 = x[ 3] - x[ 2]
	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 3] + x[ 2]  -> x[11]
	MOV	r14,#0xb5		@ r14= cPI2_8
	STMIA	r4!,{r6,r7,r10,r11}

	SUB	r5, r5, r9		@ r5 = s0 - s1
	ADD	r9, r5, r9, LSL #1	@ r9 = s0 + s1
	MOV	r5, r5, ASR #8
	MUL	r5, r14,r5		@ r5  = (s0-s1)*cPI2_8
	SUB	r12,r12,r8		@ r12= s3 - s2
	ADD	r8, r12,r8, LSL #1	@ r8 = s3 + s2

	MOV	r8, r8, ASR #8
	MUL	r8, r14,r8		@ r8  = (s3+s2)*cPI2_8
	MOV	r9, r9, ASR #8
	MUL	r9, r14,r9		@ r9  = (s0+s1)*cPI2_8
	MOV	r12,r12,ASR #8
	MUL	r12,r14,r12		@ r12 = (s3-s2)*cPI2_8
	STMIA	r1!,{r5,r8,r9,r12}

	@ block2
	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[12]
					@ r6 = x[13]
					@ r9 = x[14]
					@ r10= x[15]
	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[ 4]
					@ r8 = x[ 5]
					@ r11= x[ 6]
					@ r12= x[ 7]
	SUB	r5, r5, r6		@ r5 = s2 = x[12] - x[13]
	ADD	r6, r5, r6, LSL #1	@ r6 =      x[12] + x[13]  -> x[12]
	SUB	r9, r9, r10		@ r9 = s3 = x[14] - x[15]
	ADD	r10,r9, r10,LSL #1	@ r10=      x[14] + x[15]  -> x[14]
	SUB	r14,r7, r8		@ r14= s0 = x[ 4] - x[ 5]
	ADD	r8, r14,r8, LSL #1	@ r8 =      x[ 4] + x[ 5]  -> x[13]
	SUB	r7, r12,r11		@ r7 = s1 = x[ 7] - x[ 6]
	ADD	r11,r7, r11, LSL #1	@ r11=      x[ 7] + x[ 6]  -> x[15]
	STMIA	r4!,{r6,r8,r10,r11}
	STMIA	r1!,{r5,r7,r9,r14}

	@ mdct_butterfly_8
	LDMDB	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
					@ r6 = x[0]
					@ r7 = x[1]
					@ r8 = x[2]
					@ r9 = x[3]
					@ r10= x[4]
					@ r11= x[5]
					@ r12= x[6]
					@ r14= x[7]
	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]

	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
	STMDB	r1,{r2,r3,r4,r5,r10,r11,r12,r14}

	@ mdct_butterfly_8
	LDMIA	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
					@ r6 = x[0]
					@ r7 = x[1]
					@ r8 = x[2]
					@ r9 = x[3]
					@ r10= x[4]
					@ r11= x[5]
					@ r12= x[6]
					@ r14= x[7]
	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]

	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
	STMIA	r1,{r2,r3,r4,r5,r10,r11,r12,r14}

	ADD	r1,r1,#8*4
	SUBS	r0,r0,#64
	BGT	mdct_bufferflies_loop3

	LDMFD	r13,{r0-r3}

mdct_bitreverseARM:
	@ r0 = points
	@ r1 = in
	@ r2 = step
	@ r3 = shift

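	@ C-like sketch of the loop below (inferred from this code): walk w
	@ down from the top half and swap two-word pairs with their
	@ bit-reversed partners, each pair once only:
	@
	@     for (bit = 0; w > x; bit++, w -= 2) {
	@         int b = bitrev[bit >> 6] | (bitrev[bit & 0x3f] << 6);
	@         ogg_int32_t *xx = x + (b >> shift);
	@         if (w > xx) {
	@             swap(xx[0], w[0]);
	@             swap(xx[1], w[1]);
	@         }
	@     }
	@
	@ (The ASR below uses shift-2 because it forms a byte offset.)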
	MOV	r4, #0			@ r4 = bit = 0
	ADD	r5, r1, r0, LSL #1	@ r5 = w = x + (n>>1)
	ADR	r6, bitrev
	SUB	r3, r3, #2		@ r3 = shift -= 2
	SUB	r5, r5, #8
brev_lp:
	LDRB	r7, [r6, r4, LSR #6]
	AND	r8, r4, #0x3f
	LDRB	r8, [r6, r8]
	ADD	r4, r4, #1		@ bit++
	@ stall XScale
	ORR	r7, r7, r8, LSL #6	@ r7 = bitrev[bit]
	ADD	r9, r1, r7, LSR r3	@ r9 = xx = x + (b>>shift)
	CMP	r5, r9			@ if (w > xx)
	LDR	r10,[r5],#-8		@   r10 = w[0]		w -= 2
	LDRGT	r11,[r5,#12]		@   r11 = w[1]
	LDRGT	r12,[r9]		@   r12 = xx[0]
	LDRGT	r14,[r9,#4]		@   r14 = xx[1]
	STRGT	r10,[r9]		@   xx[0]= w[0]
	STRGT	r11,[r9,#4]		@   xx[1]= w[1]
	STRGT	r12,[r5,#8]		@   w[0] = xx[0]
	STRGT	r14,[r5,#12]		@   w[1] = xx[1]
	CMP	r5,r1
	BGT	brev_lp

	@ mdct_step7
	@ r0 = points
	@ r1 = in
	@ r2 = step
	@ r3 = shift-2

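	@ C-like sketch of one step7 iteration (taken from the register
	@ comments below; the ASR #9 / 8-bit table scaling is omitted):
	@
	@     s0  = w0[0] + w1[0];     s1  = w1[1] - w0[1];
	@     s1b = w0[0] - w1[0];     s0b = w1[1] + w0[1];
	@     s2  = s0*T[1] + s1*T[0];
	@     s3  = s1*T[1] - s0*T[0];
	@     w0[0] = (s0b>>1) + s2;   w0[1] = s3 + (s1b>>1);
	@     w1[0] = (s0b>>1) - s2;   w1[1] = s3 - (s1b>>1);
	@
	@ (In the second loop T runs backwards and the roles of T[0]/T[1]
	@ are exchanged.)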
	CMP	r2, #4			@ r5 = T = (step>=4) ?
	LDRGE	r5, =sincos_lookup0	@          sincos_lookup0 +
	LDRLT	r5, =sincos_lookup1	@          sincos_lookup1
	ADD	r7, r1, r0, LSL #1	@ r7 = w1 = x + (n>>1)
	ADDGE	r5, r5, r2, LSR #1	@		            (step>>1)
	ADD	r8, r5, #1024		@ r8 = Ttop
step7_loop1:
	LDR	r6, [r1]		@ r6 = w0[0]
	LDR	r9, [r1,#4]		@ r9 = w0[1]
	LDR	r10,[r7,#-8]!		@ r10= w1[0]	w1 -= 2
	LDR	r11,[r7,#4]		@ r11= w1[1]
	LDRB	r14,[r5,#1]		@ r14= T[1]
	LDRB	r12,[r5],r2		@ r12= T[0]	T += step

	ADD	r6, r6, r10		@ r6 = s0 = w0[0] + w1[0]
	SUB	r10,r6, r10,LSL #1	@ r10= s1b= w0[0] - w1[0]
	SUB	r11,r11,r9		@ r11= s1 = w1[1] - w0[1]
	ADD	r9, r11,r9, LSL #1	@ r9 = s0b= w1[1] + w0[1]

	MOV	r6, r6, ASR #9
	MUL	r3, r6, r14		@ r3   = s0*T[1]
	MOV	r11,r11,ASR #9
	MUL	r4, r11,r12		@ r4   = s1*T[0]
	ADD	r3, r3, r4		@ r3   = s0*T[1] + s1*T[0] = s2
	MUL	r14,r11,r14		@ r14  = s1*T[1]
	MUL	r12,r6, r12		@ r12  = s0*T[0]
	SUB	r14,r14,r12		@ r14  = s1*T[1] - s0*T[0] = s3

	@ r9 = s0b<<1
	@ r10= s1b<<1
	ADD	r9, r3, r9, ASR #1	@ r9 = s0b + s2
	SUB	r3, r9, r3, LSL #1	@ r3 = s0b - s2

	SUB	r12,r14,r10,ASR #1	@ r12= s3  - s1b
	ADD	r10,r14,r10,ASR #1	@ r10= s3  + s1b
	STR	r9, [r1],#4
	STR	r10,[r1],#4		@ w0 += 2
	STR	r3, [r7]
	STR	r12,[r7,#4]

	CMP	r5,r8
	BLT	step7_loop1

step7_loop2:
	LDR	r6, [r1]		@ r6 = w0[0]
	LDR	r9, [r1,#4]		@ r9 = w0[1]
	LDR	r10,[r7,#-8]!		@ r10= w1[0]	w1 -= 2
	LDR	r11,[r7,#4]		@ r11= w1[1]
	LDRB	r14,[r5,-r2]!		@ r14= T[0]	T -= step
	LDRB	r12,[r5,#1]		@ r12= T[1]

	ADD	r6, r6, r10		@ r6 = s0 = w0[0] + w1[0]
	SUB	r10,r6, r10,LSL #1	@ r10= s1b= w0[0] - w1[0]
	SUB	r11,r11,r9		@ r11= s1 = w1[1] - w0[1]
	ADD	r9, r11,r9, LSL #1	@ r9 = s0b= w1[1] + w0[1]

	MOV	r6, r6, ASR #9
	MUL	r3, r6, r14		@ r3   = s0*T[0]
	MOV	r11,r11,ASR #9
	MUL	r4, r11,r12		@ r4   = s1*T[1]
	ADD	r3, r3, r4		@ r3   = s0*T[0] + s1*T[1] = s2
	MUL	r14,r11,r14		@ r14  = s1*T[0]
	MUL	r12,r6, r12		@ r12  = s0*T[1]
	SUB	r14,r14,r12		@ r14  = s1*T[0] - s0*T[1] = s3

	@ r9 = s0b<<1
	@ r10= s1b<<1
	ADD	r9, r3, r9, ASR #1	@ r9 = s0b + s2
	SUB	r3, r9, r3, LSL #1	@ r3 = s0b - s2

	SUB	r12,r14,r10,ASR #1	@ r12= s3  - s1b
	ADD	r10,r14,r10,ASR #1	@ r10= s3  + s1b
	STR	r9, [r1],#4
	STR	r10,[r1],#4		@ w0 += 2
	STR	r3, [r7]
	STR	r12,[r7,#4]

	CMP	r1,r7
	BLT	step7_loop2

	LDMFD	r13!,{r0-r3}

	@ r0 = points
	@ r1 = in
	@ r2 = step
	@ r3 = shift
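	@ C-like sketch of the default step-8 loop below (from the comments;
	@ the ASR #8 / 8-bit table scaling is omitted):
	@
	@     do {
	@         s0 =  x[0];
	@         s1 = -x[1];
	@         x[0] = s0*T[0] + s1*T[1];
	@         x[1] = s1*T[0] - s0*T[1];
	@         x += 2;  T += step;
	@     } while (x < iX);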
	MOV	r2, r2, ASR #2		@ r2 = step >>= 2
	CMP	r2, #0
	CMPNE	r2, #1
	BEQ	mdct_end

	@ step > 1 (default case)
	CMP	r2, #4			@ r5 = T = (step>=4) ?
	LDRGE	r5, =sincos_lookup0	@          sincos_lookup0 +
	LDRLT	r5, =sincos_lookup1	@          sincos_lookup1
	ADD	r7, r1, r0, LSL #1	@ r7 = iX = x + (n>>1)
	ADDGE	r5, r5, r2, LSR #1	@		            (step>>1)
mdct_step8_default:
	LDR	r6, [r1],#4		@ r6 =  s0 = x[0]
	LDR	r8, [r1],#4		@ r8 = -s1 = x[1]
	LDRB	r12,[r5,#1]		@ r12= T[1]
	LDRB	r14,[r5],r2		@ r14= T[0]	T += step
	RSB	r8, r8, #0		@ r8 = s1

	@ XPROD31(s0, s1, T[0], T[1], x, x+1)
	@ x[0] = s0 * T[0] + s1 * T[1]      x[1] = s1 * T[0] - s0 * T[1]
	MOV	r6, r6, ASR #8
	MOV	r8, r8, ASR #8
	MUL	r10,r8, r12		@ r10  = s1 * T[1]
	CMP	r1, r7
	MLA	r10,r6, r14,r10	@ r10 += s0 * T[0]
	RSB	r6, r6, #0		@ r6 = -s0
	MUL	r11,r8, r14		@ r11  = s1 * T[0]
	MLA	r11,r6, r12,r11	@ r11 -= s0 * T[1]
	STR	r10,[r1,#-8]
	STR	r11,[r1,#-4]
	BLT	mdct_step8_default

mdct_end:
	MOV	r0, r2
	LDMFD	r13!,{r4-r11,PC}

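	@ bitrev[] is a 64-entry table: bitrev[i] is i with its low 6 bits
	@ reversed. Two lookups are combined in brev_lp above to bit-reverse
	@ a 12-bit index.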
bitrev:
	.byte	0
	.byte	32
	.byte	16
	.byte	48
	.byte	8
	.byte	40
	.byte	24
	.byte	56
	.byte	4
	.byte	36
	.byte	20
	.byte	52
	.byte	12
	.byte	44
	.byte	28
	.byte	60
	.byte	2
	.byte	34
	.byte	18
	.byte	50
	.byte	10
	.byte	42
	.byte	26
	.byte	58
	.byte	6
	.byte	38
	.byte	22
	.byte	54
	.byte	14
	.byte	46
	.byte	30
	.byte	62
	.byte	1
	.byte	33
	.byte	17
	.byte	49
	.byte	9
	.byte	41
	.byte	25
	.byte	57
	.byte	5
	.byte	37
	.byte	21
	.byte	53
	.byte	13
	.byte	45
	.byte	29
	.byte	61
	.byte	3
	.byte	35
	.byte	19
	.byte	51
	.byte	11
	.byte	43
	.byte	27
	.byte	59
	.byte	7
	.byte	39
	.byte	23
	.byte	55
	.byte	15
	.byte	47
	.byte	31
	.byte	63

	@ END