@ Tremolo library
@-----------------------------------------------------------------------
@ Copyright (C) 2002-2009, Xiph.org Foundation
@ Copyright (C) 2010, Robin Watts for Pinknoise Productions Ltd
@ All rights reserved.

@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions
@ are met:

@     * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@     * Redistributions in binary form must reproduce the above
@ copyright notice, this list of conditions and the following disclaimer
@ in the documentation and/or other materials provided with the
@ distribution.
@     * Neither the names of the Xiph.org Foundation nor Pinknoise
@ Productions Ltd nor the names of its contributors may be used to
@ endorse or promote products derived from this software without
@ specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
@ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
@ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@ ----------------------------------------------------------------------
347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
@ Everything below is emitted into the code section.
    .text

	@ full accuracy version

	@ Routines this file makes visible to the linker.
	.globl	mdct_backwardARM
	.globl	mdct_shift_right
	.globl	mdct_unroll_prelap
	.globl	mdct_unroll_part2
	.globl	mdct_unroll_part3
	.globl	mdct_unroll_postlap

	@ Lookup tables declared as external symbols and given hidden
	@ ELF visibility (mdct_backwardARM refers to them as T).
	.extern	sincos_lookup0
	.extern	sincos_lookup1
	.hidden	sincos_lookup0
	.hidden	sincos_lookup1
507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
mdct_unroll_prelap:
	@ Convert the words lying between post and r into saturated
	@ 16-bit samples: each word is read with pre-decrement from r,
	@ shifted right by 9, clamped to the signed 16-bit range and
	@ stored through out.
	@
	@ In:   r0 = out   halfword destination, advanced after each store
	@       r1 = post  lower end of the region being read
	@       r2 = r     upper end of the region (read downwards)
	@       r3 = step  output stride, in halfwords
	@ Uses: r4 = clamp seed, r5-r7/ip = samples, lr = scratch
	stmdb	sp!, {r4-r7, lr}
	mvn	r4, #0x8000		@ r4 = ~0x8000 = 0xFFFF7FFF
	mov	r3, r3, lsl #1		@ stride in bytes for the halfword stores
	sub	r1, r2, r1		@ r1 = r - post (bytes still to read)
	subs	r1, r1, #16		@ bias by one group of four words
	blt	unroll_over
unroll_loop:
	@ Four words per pass.  LDMDB places the highest address in ip,
	@ so ip is the first *--r and r5 the fourth.
	ldmdb	r2!, {r5, r6, r7, ip}

	.irp	smp, r5, r6, r7, ip
	mov	\smp, \smp, asr #9	@ smp = (*--r) >> 9
	.endr

	.irp	smp, ip, r7, r6, r5
	mov	lr, \smp, asr #15	@ lr = everything above bit 14
	teq	lr, lr, asr #31		@ 0 or -1 means smp already fits
	eorne	\smp, r4, lr, asr #31	@ otherwise low half = 0x7FFF or 0x8000
	strh	\smp, [r0], r3		@ *out = smp; out += stride
	.endr

	subs	r1, r1, #16
	bge	unroll_loop

unroll_over:
	adds	r1, r1, #16		@ remove the bias: bytes left over
	ble	unroll_end
unroll_loop2:
	@ Tail: one word at a time.
	ldr	r5, [r2, #-4]!
	@ stall
	@ stall (Xscale)
	mov	r5, r5, asr #9		@ r5 = (*--r) >> 9
	mov	lr, r5, asr #15
	teq	lr, lr, asr #31		@ in range when lr is 0 or -1
	eorne	r5, r4, lr, asr #31
	strh	r5, [r0], r3
	subs	r1, r1, #4
	bgt	unroll_loop2
unroll_end:
	ldmia	sp!, {r4-r7, pc}
1097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
mdct_unroll_postlap:
	@ Store saturated 16-bit samples computed as (-*l) >> 9, reading
	@ every second word upwards from l.
	@
	@ In:   r0 = out   halfword destination, advanced after each store
	@       r1 = post  upper end of the region being read
	@       r2 = l     first word to read; advanced 8 bytes per sample
	@       r3 = step  output stride, in halfwords
	@ Uses: r4 = clamp seed, r5-r7/ip = samples, lr = scratch
	stmdb	sp!, {r4-r7, lr}
	mvn	r4, #0x8000		@ r4 = ~0x8000 = 0xFFFF7FFF
	mov	r3, r3, lsl #1		@ stride in bytes for the halfword stores
	sub	r1, r1, r2		@ r1 = post - l (bytes)
	mov	r1, r1, asr #1		@ halve: the counter drops 4 per sample while l moves 8
	subs	r1, r1, #16		@ bias by one group of four samples
	blt	unroll_over3
unroll_loop3:
	@ Four samples per pass, skipping the word after each one read.
	.irp	smp, ip, r7, r6, r5
	ldr	\smp, [r2], #8
	.endr

	.irp	smp, ip, r5, r6, r7
	rsb	\smp, \smp, #0		@ smp = -*l
	.endr

	.irp	smp, ip, r5, r6, r7
	mov	\smp, \smp, asr #9	@ smp = (-*l) >> 9
	.endr

	.irp	smp, ip, r7, r6, r5
	mov	lr, \smp, asr #15	@ lr = everything above bit 14
	teq	lr, lr, asr #31		@ 0 or -1 means smp already fits
	eorne	\smp, r4, lr, asr #31	@ otherwise low half = 0x7FFF or 0x8000
	strh	\smp, [r0], r3		@ *out = smp; out += stride
	.endr

	subs	r1, r1, #16
	bge	unroll_loop3

unroll_over3:
	adds	r1, r1, #16		@ remove the bias: count left over
	ble	unroll_over4
unroll_loop4:
	@ Tail: one sample at a time.
	ldr	r5, [r2], #8
	@ stall
	@ stall (Xscale)
	rsb	r5, r5, #0
	mov	r5, r5, asr #9		@ r5 = (-*l) >> 9
	mov	lr, r5, asr #15
	teq	lr, lr, asr #31		@ in range when lr is 0 or -1
	eorne	r5, r4, lr, asr #31
	strh	r5, [r0], r3
	subs	r1, r1, #4
	bgt	unroll_loop4
unroll_over4:
	ldmia	sp!, {r4-r7, pc}
1787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
mdct_unroll_part2:
	@ Windowed overlap, summing form.  For every word between post
	@ and r:
	@   *out = clip16((hi32(*--r * *--wR) + hi32(*l * *wL++)) >> 8)
	@ where l is moved down two words before each read.
	@
	@ In:    r0 = out, r1 = post, r2 = l, r3 = r
	@ Stack: [sp] = step, [sp,#4] = wL, [sp,#8] = wR  (at entry)
	@ Uses:  r4 = clamp seed, r6/r7/r11/ip = operands, lr = scratch
	mov	ip, sp			@ keep the address of the stacked arguments
	stmdb	sp!, {r4, r6-r11, lr}
	ldmia	ip, {r8-r10}		@ r8 = step
					@ r9 = wL
					@ r10= wR
	mvn	r4, #0x8000		@ r4 = ~0x8000 = 0xFFFF7FFF
	mov	r8, r8, lsl #1		@ stride in bytes for the halfword stores
	subs	r1, r3, r1		@ r1 = r - post (bytes to consume)
	ble	unroll_over5
unroll_loop5:
	ldr	ip, [r2, #-8]!		@ l -= 2, then ip = *l
	ldr	r11, [r9], #4		@ r11 = *wL++
	ldr	r7, [r3, #-4]!		@ r7  = *--r
	ldr	r6, [r10, #-4]!		@ r6  = *--wR

	@ Original author's note: a cycle can be saved here, at the
	@ cost of 1-bit errors in rounding.
	@ Only the high product words are kept; lr just absorbs the low ones.
	smull	lr, r11, ip, r11	@ r11 = hi32(*l * *wL)
	smull	lr, r6, r7, r6		@ r6  = hi32(*r * *wR)
	add	r6, r6, r11
	mov	r6, r6, asr #8
	mov	lr, r6, asr #15		@ lr = everything above bit 14
	teq	lr, lr, asr #31		@ 0 or -1 means r6 already fits
	eorne	r6, r4, lr, asr #31	@ otherwise low half = 0x7FFF or 0x8000
	strh	r6, [r0], r8		@ *out = r6; out += stride

	subs	r1, r1, #4
	bgt	unroll_loop5

unroll_over5:
	ldmia	sp!, {r4, r6-r11, pc}
2177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
mdct_unroll_part3:
	@ Windowed overlap, differencing form.  For every word between r
	@ and post:
	@   *out = clip16((hi32(*r++ * *--wR) - hi32(*l * *wL++)) >> 8)
	@ where l is moved up two words after each read.
	@
	@ In:    r0 = out, r1 = post, r2 = l, r3 = r
	@ Stack: [sp] = step, [sp,#4] = wL, [sp,#8] = wR  (at entry)
	@ Uses:  r4 = clamp seed, r6/r7/r11/ip = operands, lr = scratch
	mov	ip, sp			@ keep the address of the stacked arguments
	stmdb	sp!, {r4, r6-r11, lr}
	ldmia	ip, {r8-r10}		@ r8 = step
					@ r9 = wL
					@ r10= wR
	mvn	r4, #0x8000		@ r4 = ~0x8000 = 0xFFFF7FFF
	mov	r8, r8, lsl #1		@ stride in bytes for the halfword stores
	subs	r1, r1, r3		@ r1 = post - r (bytes to consume)
	ble	unroll_over6
unroll_loop6:
	ldr	ip, [r2], #8		@ ip  = *l, then l += 2
	ldr	r11, [r9], #4		@ r11 = *wL++
	ldr	r7, [r3], #4		@ r7  = *r++
	ldr	r6, [r10, #-4]!		@ r6  = *--wR

	@ Original author's note: a cycle can be saved here, at the
	@ cost of 1-bit errors in rounding.
	@ Only the high product words are kept; lr just absorbs the low ones.
	smull	lr, r11, ip, r11	@ r11 = hi32(*l * *wL)
	smull	lr, r6, r7, r6		@ r6  = hi32(*r * *wR)
	sub	r6, r6, r11
	mov	r6, r6, asr #8
	mov	lr, r6, asr #15		@ lr = everything above bit 14
	teq	lr, lr, asr #31		@ 0 or -1 means r6 already fits
	eorne	r6, r4, lr, asr #31	@ otherwise low half = 0x7FFF or 0x8000
	strh	r6, [r0], r8		@ *out = r6; out += stride

	subs	r1, r1, #4
	bgt	unroll_loop6

unroll_over6:
	ldmia	sp!, {r4, r6-r11, pc}
2567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
mdct_shift_right:
	@ Copy every second word of in[], starting at in[1], into right[]:
	@   right[i] = in[2*i + 1]   for i = 0 .. (n >> 2) - 1
	@
	@ In:   r0 = n      (shifted right by 2, unsigned, to give the word count)
	@       r1 = in     source words
	@       r2 = right  destination words, written contiguously
	@ Uses: r3-r8, r12, r14 as the copy buffer.
	@
	@ Only r4-r8 and the return address are saved: r9-r11 are never
	@ written here, so stacking them would only cost memory traffic.
	STMFD	r13!,{r4-r8,r14}

	MOV	r0, r0, LSR #2		@ n >>= 2
	ADD	r1, r1, #4		@ start at in[1]

	SUBS	r0, r0,	#8		@ bias by one group of eight words
	BLT	sr_less_than_8
sr_loop:
	@ Gather eight words, stepping over the one after each, then
	@ write them out with a single store-multiple.
	LDR	r3, [r1], #8
	LDR	r4, [r1], #8
	LDR	r5, [r1], #8
	LDR	r6, [r1], #8
	LDR	r7, [r1], #8
	LDR	r8, [r1], #8
	LDR	r12,[r1], #8
	LDR	r14,[r1], #8
	SUBS	r0, r0, #8		@ flags survive the STMIA below
	STMIA	r2!,{r3,r4,r5,r6,r7,r8,r12,r14}
	BGE	sr_loop
sr_less_than_8:
	ADDS	r0, r0, #8		@ remove the bias: words left over
	BEQ	sr_end
sr_loop2:
	@ Tail: one word at a time.
	LDR	r3, [r1], #8
	SUBS	r0, r0, #1
	STR	r3, [r2], #4
	BGT	sr_loop2
sr_end:
	LDMFD	r13!,{r4-r8,PC}
2907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
2917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_backwardARM:
2927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = n
2937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in
2947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMFD	r13!,{r4-r11,r14}
2957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
2967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2,#1<<4	@ r2 = 1<<shift
2977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r3,#13-4	@ r3 = 13-shift
2987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangfind_shift_loop:
2997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	TST	r0,r2		@ if (n & (1<<shift)) == 0
3007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2,r2,LSL #1
3017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUBEQ	r3,r3,#1	@ shift--
3027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BEQ	find_shift_loop
3037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2,#2
3047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2,r2,LSL r3	@ r2 = step = 2<<shift
3057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ presymmetry
3077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = n (a multiple of 4)
3087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in
3097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = step
3107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 = shift
3117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r4, r1, r0, LSL #1	@ r4 = aX = in+(n>>1)
3137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r14,r1, r0		@ r14= in+(n>>2)
3147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r4, r4, #3*4		@ r4 = aX = in+n2-3
315e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADRL	r7, .Lsincos_lookup
316e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	LDR	r5, [r7]		@ r5 = T=sincos_lookup0
317e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADD	r5, r7
3187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangpresymmetry_loop1:
3207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r7, [r4,#8]		@ r6 = s2 = aX[2]
3217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r11,[r5,#4]		@ r11= T[1]
3227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6, [r4]		@ r6 = s0 = aX[0]
3237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r5],r2,LSL #2	@ r10= T[0]   T += step
3247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XPROD31(s0, s2, T[0], T[1], 0xaX[0], &ax[2])
3267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r8, r9, r7, r11		@ (r8, r9)   = s2*T[1]
3277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall
3287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
3297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r8, r9, r6, r10		@ (r8, r9)  += s0*T[0]
3307913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r6, r6, #0
3317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
3327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r8, r12,r7, r10		@ (r8, r12)  = s2*T[0]
3337913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r9, LSL #1
3347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
3357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r8, r12,r6, r11		@ (r8, r12) -= s0*T[1]
3367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r9, [r4],#-16		@ aX[0] = r9
3377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r4,r14
3387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
3397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r12,[r4,#8+16]		@ aX[2] = r12
3407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BGE	presymmetry_loop1	@ while (aX >= in+n4)
3427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangpresymmetry_loop2:
3447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6,[r4]			@ r6 = s0 = aX[0]
3457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r5,#4]		@ r10= T[1]
3467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r7,[r4,#8]		@ r6 = s2 = aX[2]
3477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r11,[r5],-r2,LSL #2	@ r11= T[0]   T -= step
3487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XPROD31(s0, s2, T[1], T[0], 0xaX[0], &ax[2])
3507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r8, r9, r6, r10		@ (r8, r9)   = s0*T[1]
3517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall
3527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
3537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r8, r9, r7, r11		@ (r8, r9)  += s2*T[0]
3547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r6, r6, #0
3557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
3567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r8, r12,r7, r10		@ (r8, r12)  = s2*T[1]
3577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r9, LSL #1
3587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
3597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r8, r12,r6, r11		@ (r8, r12) -= s0*T[0]
3607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r9, [r4],#-16		@ aX[0] = r9
3617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r4,r1
3627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
3637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r12,[r4,#8+16]		@ aX[2] = r12
3647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BGE	presymmetry_loop2	@ while (aX >= in)
3667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = n
3687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in
3697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = step
3707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 = shift
3717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMFD	r13!,{r3}
372e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADRL	r4, .Lsincos_lookup
373e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	LDR	r5, [r4]		@ r5 = T=sincos_lookup0
374e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADD	r5, r4
3757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r4, r1, r0, LSL #1	@ r4 = aX = in+(n>>1)
3767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r4, r4, #4*4		@ r4 = aX = in+(n>>1)-4
3777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r11,[r5,#4]		@ r11= T[1]
3787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r5],r2, LSL #2	@ r10= T[0]    T += step
3797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangpresymmetry_loop3:
3807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r8,[r1],#16 		@ r8 = ro0 = bX[0]
3817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r9,[r1,#8-16]		@ r9 = ro2 = bX[2]
3827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6,[r4]			@ r6 = ri0 = aX[0]
3837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
3847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XNPROD31( ro2, ro0, T[1], T[0], 0xaX[0], &aX[2] )
3857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31
3867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r12,r8, r11		@ (r14,r12)  = ro0*T[1]
3877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r8,r8,#0		@ r8 = -ro0
3887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ Stall ?
3897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r12,r9, r10		@ (r14,r12) += ro2*T[0]
3907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r7,[r4,#8]		@ r7 = ri2 = aX[2]
3917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ Stall ?
3927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r3, r9, r11		@ (r14,r3)   = ro2*T[1]
3937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
3947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r11,[r5,#4]		@ r11= T[1]
3957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r3, r8, r10		@ (r14,r3)  -= ro0*T[0]
3967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r5],r2, LSL #2	@ r10= T[0]    T += step
3977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r12,[r4,#8]
3987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r3, r3, LSL #1
3997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r3, [r4],#-16
4007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XNPROD31( ri2, ri0, T[0], T[1], 0xbX[0], &bX[2] )
4027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31
4037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r12,r6, r10		@ (r14,r12)  = ri0*T[0]
4047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r6,r6,#0		@ r6 = -ri0
4057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
4067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r12,r7, r11		@ (r14,r12) += ri2*T[1]
4077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
4087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
4097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r3, r7, r10		@ (r14,r3)   = ri2*T[0]
4107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
4117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall ?
4127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r3, r6, r11		@ (r14,r3)  -= ri0*T[1]
4137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r4,r1
4147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r12,[r1,#8-16]
4157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r3, r3, LSL #1
4167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r3, [r1,#-16]
4177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BGE	presymmetry_loop3
4197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r1,r1,r0		@ r1 = in -= n>>2 (i.e. restore in)
4217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r3,[r13]
4237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r2,[r13,#-4]!
4247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterflies
4267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = n  = (points * 2)
4277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in = x
4287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = i
4297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 = shift
4307913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMFD	r13!,{r0-r1}
431e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADRL	r4, .Lsincos_lookup
432e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	LDR	r5, [r4]
433e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADD	r5, r4
4347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSBS	r4,r3,#6		@ r4 = stages = 7-shift then --stages
4357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BLE	no_generics
4367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r14,#4			@ r14= 4               (i=0)
4377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r6, r14,LSL r3		@ r6 = (4<<i)<<shift
4387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_butterflies_loop1:
4397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r0, r0, LSR #1		@ r0 = points>>i = POINTS
4407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2, r14,LSR #2		@ r2 = (1<<i)-j        (j=0)
4417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMFD	r13!,{r4,r14}
4427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_butterflies_loop2:
4437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift))
4457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly_generic(r1, r0, r6)
4467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = points
4477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = x
4487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ preserve r2 (external loop counter)
4497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ preserve r3
4507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ preserve r4 (external loop counter)
4517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r5 = T = sincos_lookup0
4527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r6 = step
4537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ preserve r14
4547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r2,[r13,#-4]!		@ stack r2
4567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r1,r1,r0,LSL #1		@ r1 = x2+4 = x + (POINTS>>1)
4577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7,r1,r0,LSL #1		@ r7 = x1+4 = x + POINTS
4587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r5,#1024*4		@ r12= sincos_lookup0+1024
4597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_bufferfly_generic_loop1:
4617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMDB	r7!,{r2,r3,r8,r11}	@ r2 = x1[0]
4627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r3 = x1[1]
4637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x1[2]
4647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x1[3]    x1 -= 4
4657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMDB	r1!,{r4,r9,r10,r14}	@ r4 = x2[0]
4667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x2[1]
4677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x2[2]
4687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r14= x2[3]    x2 -= 4
4697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r2, r2, r3		@ r2 = s0 = x1[0] - x1[1]
4717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r3, r2, r3, LSL #1	@ r3 =      x1[0] + x1[1] (-> x1[0])
4727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r11,r8		@ r11= s1 = x1[3] - x1[2]
4737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r11,r8, LSL #1	@ r8 =      x1[3] + x1[2] (-> x1[2])
4747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9, r4		@ r9 = s2 = x2[1] - x2[0]
4757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r4, r9, r4, LSL #1	@ r4 =      x2[1] + x2[0] (-> x1[1])
4767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r14,r10		@ r14= s3 = x2[3] - x2[2]
4777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r14,r10,LSL #1	@ r10=      x2[3] + x2[2] (-> x1[3])
4787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r7,{r3,r4,r8,r10}
4797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = points
4817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = x2
4827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = s0
4837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 free
4847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r4 free
4857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r5 = T
4867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r6 = step
4877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r7 = x1
4887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r8 free
4897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r9 = s2
4907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r10 free
4917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r11= s1
4927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r12= limit
4937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r14= s3
4947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r8, [r5,#4]		@ r8 = T[1]
4967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r5],r6,LSL #2	@ r10= T[0]		T += step
4977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
4987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2])
4997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x2[0] = (s1*T[0] + s0*T[1])>>31     x2[2] = (s0*T[0] - s1*T[1])>>31
5007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall Xscale
5017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r4, r3, r2, r8		@ (r4, r3)   = s0*T[1]
5027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r4, r3, r11,r10		@ (r4, r3)  += s1*T[0]
5037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r11,r11,#0
5047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r11,r4, r8, r11		@ (r11,r4)   = -s1*T[1]
5057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r11,r4, r2, r10		@ (r11,r4)  += s0*T[0]
5067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2, r3, LSL #1		@ r2 = r3<<1 = Value for x2[0]
5077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3])
5097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x2[1] = (s2*T[0] + s3*T[1])>>31     x2[3] = (s3*T[0] - s2*T[1])>>31
5107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r11,r3, r9, r10		@ (r11,r3)   = s2*T[0]
5117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r4, r4, LSL #1		@ r4 = r4<<1 = Value for x2[2]
5127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r11,r3, r14,r8		@ (r11,r3)  += s3*T[1]
5137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r9, r9, #0
5147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r10,r11,r14,r10		@ (r10,r11)  = s3*T[0]
5157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r3, r3, LSL #1		@ r3 = r3<<1 = Value for x2[1]
5167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r10,r11,r9,r8		@ (r10,r11) -= s2*T[1]
5177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r5, r12
5187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r11,r11,LSL #1		@ r11= r11<<1 = Value for x2[3]
5197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1,{r2,r3,r4,r11}
5217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BLT	mdct_bufferfly_generic_loop1
5237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,#1024*4
5257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_bufferfly_generic_loop2:
5267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMDB	r7!,{r2,r3,r9,r10}	@ r2 = x1[0]
5277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r3 = x1[1]
5287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x1[2]
5297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x1[3]    x1 -= 4
5307913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMDB	r1!,{r4,r8,r11,r14}	@ r4 = x2[0]
5317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x2[1]
5327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x2[2]
5337913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r14= x2[3]    x2 -= 4
5347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r2, r2, r3		@ r2 = s0 = x1[0] - x1[1]
5367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r3, r2, r3, LSL #1	@ r3 =      x1[0] + x1[1] (-> x1[0])
5377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9,r10		@ r9 = s1 = x1[2] - x1[3]
5387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r9,r10, LSL #1	@ r10=      x1[2] + x1[3] (-> x1[2])
5397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r4, r4, r8		@ r4 = s2 = x2[0] - x2[1]
5407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r4, r8, LSL #1	@ r8 =      x2[0] + x2[1] (-> x1[1])
5417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r14,r11		@ r14= s3 = x2[3] - x2[2]
5427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r14,r11,LSL #1	@ r11=      x2[3] + x2[2] (-> x1[3])
5437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r7,{r3,r8,r10,r11}
5447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = points
5467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = x2
5477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = s0
5487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 free
5497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r4 = s2
5507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r5 = T
5517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r6 = step
5527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r7 = x1
5537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r8 free
5547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r9 = s1
5557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r10 free
5567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r11 free
5577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r12= limit
5587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r14= s3
5597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r8, [r5,#4]		@ r8 = T[1]
5617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r5],-r6,LSL #2	@ r10= T[0]		T -= step
5627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2])
5647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x2[0] = (s0*T[0] - s1*T[1])>>31     x2[2] = (s1*T[0] + s0*T[1])>>31
5657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall Xscale
5667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r3, r11,r2, r8		@ (r3, r11)  = s0*T[1]
5677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r3, r11,r9, r10		@ (r3, r11) += s1*T[0]
5687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r9, r9, #0
5697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r3, r2, r10,r2		@ (r3, r2)   = s0*T[0]
5707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r3, r2, r9, r8		@ (r3, r2)  += -s1*T[1]
5717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r11,LSL #1		@ r9 = r11<<1 = Value for x2[2]
5727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3])
5747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x2[1] = (s3*T[0] - s2*T[1])>>31     x2[3] = (s2*T[0] + s3*T[1])>>31
5757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r3, r11,r4, r10		@ (r3,r11)   = s2*T[0]
5767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2, r2, LSL #1		@ r2 = r2<<1  = Value for x2[0]
5777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r3, r11,r14,r8		@ (r3,r11)  += s3*T[1]
5787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r4, r4, #0
5797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r10,r3,r14,r10		@ (r10,r3)   = s3*T[0]
5807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r11,r11,LSL #1		@ r11= r11<<1 = Value for x2[3]
5817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r10,r3, r4, r8		@ (r10,r3)  -= s2*T[1]
5827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r5, r12
5837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r3, r3, LSL #1		@ r3 = r3<<1  = Value for x2[1]
5847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1,{r2,r3,r9,r11}
5867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BGT	mdct_bufferfly_generic_loop2
5887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r2,[r13],#4		@ unstack r2
5907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r1, r1, r0, LSL #2	@ r1 = x+POINTS*j
5917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall Xscale
5927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUBS	r2, r2, #1		@ r2--                 (j++)
5937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BGT	mdct_butterflies_loop2
5947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMFD	r13!,{r4,r14}
5967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r1,[r13,#4]
5987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
5997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUBS	r4, r4, #1		@ stages--
6007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r14,r14,LSL #1		@ r14= 4<<i            (i++)
6017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r6, r6, LSL #1		@ r6 = step <<= 1      (i++)
6027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BGE	mdct_butterflies_loop1
6037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMFD	r13,{r0-r1}
6047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangno_generics:
6057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterflies part2 (loop around mdct_butterfly_32)
6067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = points
6077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in
6087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = step
6097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 = shift
6107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_bufferflies_loop3:
6127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly_32
6137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block1
6157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r4, r1, #16*4		@ r4 = &in[16]
6167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[16]
6177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[17]
6187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[18]
6197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[19]
6207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[0]
6217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[1]
6227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[2]
6237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[3]
6247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s0 = x[16] - x[17]
6257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[16] + x[17]  -> x[16]
6267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9, r10		@ r9 = s1 = x[18] - x[19]
6277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r9, r10,LSL #1	@ r10=      x[18] + x[19]  -> x[18]
6287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r8, r8, r7		@ r8 = s2 = x[ 1] - x[ 0]
6297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 1] + x[ 0]  -> x[17]
6307913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r11		@ r12= s3 = x[ 3] - x[ 2]
6317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 3] + x[ 2]  -> x[19]
6327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r7,r10,r11}
6337913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6,cPI1_8
6357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r7,cPI3_8
6367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] )
6387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x[0] = s0*cPI3_8 - s1*cPI1_8     x[2] = s1*cPI3_8 + s0*cPI1_8
6397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall Xscale
6407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r11,r5, r6		@ (r14,r11)  = s0*cPI1_8
6417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r11,r9, r7		@ (r14,r11) += s1*cPI3_8
6427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r9, r9, #0
6437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r5, r7, r5		@ (r14,r5)   = s0*cPI3_8
6447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r5, r9, r6		@ (r14,r5)  -= s1*cPI1_8
6457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r11,r11,LSL #1
6467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r5, r5, LSL #1
6477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] )
6497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x[1] = s2*cPI1_8 + s3*cPI3_8     x[3] = s3*cPI1_8 - s2*cPI3_8
6507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r9, r8, r6		@ (r14,r9)   = s2*cPI1_8
6517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r9, r12,r7		@ (r14,r9)  += s3*cPI3_8
6527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r8,r8,#0
6537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r12,r6, r12		@ (r14,r12)  = s3*cPI1_8
6547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r12,r8, r7		@ (r14,r12) -= s2*cPI3_8
6557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r9, LSL #1
6567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
6577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r9,r11,r12}
6587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block2
6607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[20]
6617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[21]
6627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[22]
6637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[23]
6647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[4]
6657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[5]
6667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[6]
6677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[7]
6687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s0 = x[20] - x[21]
6697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[20] + x[21]  -> x[20]
6707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9, r10		@ r9 = s1 = x[22] - x[23]
6717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r9, r10,LSL #1	@ r10=      x[22] + x[23]  -> x[22]
6727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r8, r8, r7		@ r8 = s2 = x[ 5] - x[ 4]
6737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 5] + x[ 4]  -> x[21]
6747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r11		@ r12= s3 = x[ 7] - x[ 6]
6757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 7] + x[ 6]  -> x[23]
6767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r14,cPI2_8
6777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r7,r10,r11}
6787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r9		@ r5 = s0 - s1
6807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r5, r9, LSL #1	@ r9 = s0 + s1
6817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r5, r14,r5		@ (r6,r5)  = (s0-s1)*cPI2_8
6827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r8		@ r12= s3 - s2
6837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r12,r8, LSL #1	@ r8 = s3 + s2
6847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r8, r14,r8		@ (r6,r8)  = (s3+s2)*cPI2_8
6867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r5, r5, LSL #1
6877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r9, r14,r9		@ (r6,r9)  = (s0+s1)*cPI2_8
6887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r8, r8, LSL #1
6897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r12,r14,r12		@ (r6,r12) = (s3-s2)*cPI2_8
6907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r9, LSL #1
6917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
6927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r8,r9,r12}
6937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
6947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block3
6957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[24]
6967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[25]
6977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[26]
6987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[27]
6997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[8]
7007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[9]
7017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[10]
7027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[11]
7037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s0 = x[24] - x[25]
7047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[24] + x[25]  -> x[24]
7057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9, r10		@ r9 = s1 = x[26] - x[27]
7067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r9, r10,LSL #1	@ r10=      x[26] + x[27]  -> x[26]
7077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r8, r8, r7		@ r8 = s2 = x[ 9] - x[ 8]
7087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 9] + x[ 8]  -> x[25]
7097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r11		@ r12= s3 = x[11] - x[10]
7107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r12,r11, LSL #1	@ r11=      x[11] + x[10]  -> x[27]
7117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r7,r10,r11}
7127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6,cPI3_8
7147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r7,cPI1_8
7157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] )
7177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x[8] = s0*cPI1_8 - s1*cPI3_8     x[10] = s1*cPI1_8 + s0*cPI3_8
7187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall Xscale
7197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r11,r5, r6		@ (r14,r11)  = s0*cPI3_8
7207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r11,r9, r7		@ (r14,r11) += s1*cPI1_8
7217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r9, r9, #0
7227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r5, r7, r5		@ (r14,r5)   = s0*cPI1_8
7237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r5, r9, r6		@ (r14,r5)  -= s1*cPI3_8
7247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r11,r11,LSL #1
7257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r5, r5, LSL #1
7267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] )
7287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x[9] = s2*cPI3_8 + s3*cPI1_8     x[11] = s3*cPI3_8 - s2*cPI1_8
7297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r9, r8, r6		@ (r14,r9)   = s2*cPI3_8
7307913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r9, r12,r7		@ (r14,r9)  += s3*cPI1_8
7317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r8,r8,#0
7327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r14,r12,r6, r12		@ (r14,r12)  = s3*cPI3_8
7337913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r14,r12,r8, r7		@ (r14,r12) -= s2*cPI1_8
7347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r9, LSL #1
7357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
7367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r9,r11,r12}
7377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block4
7397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r10,r11}	@ r5 = x[28]
7407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[29]
7417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[30]
7427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[31]
7437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r8,r9,r12,r14}	@ r8 = x[12]
7447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[13]
7457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[14]
7467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r14= x[15]
7477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s0 = x[28] - x[29]
7487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[28] + x[29]  -> x[28]
7497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r7, r14,r12		@ r7 = s3 = x[15] - x[14]
7507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r7, r12, LSL #1	@ r12=      x[15] + x[14]  -> x[31]
7517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r10,r10,r11		@ r10= s1 = x[30] - x[31]
7527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r10,r11,LSL #1	@ r11=      x[30] + x[31]  -> x[30]
7537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14, r8, r9		@ r14= s2 = x[12] - x[13]
7547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r14, r9, LSL #1	@ r9 =      x[12] + x[13]  -> x[29]
7557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r9,r11,r12}
7567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r7,r10,r14}
7577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly16 (1st version)
7597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block 1
7607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r1,r1,#16*4
7617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r4,r1,#8*4
7627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[ 8]
7637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[ 9]
7647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[10]
7657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[11]
7667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[0]
7677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[1]
7687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[2]
7697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[3]
7707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s0 = x[ 8] - x[ 9]
7717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[ 8] + x[ 9]  -> x[ 8]
7727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9, r10		@ r9 = s1 = x[10] - x[11]
7737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r9, r10,LSL #1	@ r10=      x[10] + x[11]  -> x[10]
7747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r8, r8, r7		@ r8 = s2 = x[ 1] - x[ 0]
7757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 1] + x[ 0]  -> x[ 9]
7767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r11		@ r12= s3 = x[ 3] - x[ 2]
7777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 3] + x[ 2]  -> x[11]
7787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r14,cPI2_8
7797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r7,r10,r11}
7807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r9		@ r5 = s0 - s1
7827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r5, r9, LSL #1	@ r9 = s0 + s1
7837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r5, r14,r5		@ (r6,r5)  = (s0-s1)*cPI2_8
7847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r8		@ r12= s3 - s2
7857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r12,r8, LSL #1	@ r8 = s3 + s2
7867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r8, r14,r8		@ (r6,r8)  = (s3+s2)*cPI2_8
7887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r5, r5, LSL #1
7897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r9, r14,r9		@ (r6,r9)  = (s0+s1)*cPI2_8
7907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r8, r8, LSL #1
7917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r12,r14,r12		@ (r6,r12) = (s3-s2)*cPI2_8
7927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r9, LSL #1
7937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
7947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r8,r9,r12}
7957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
7967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block4
7977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[12]
7987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[13]
7997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[14]
8007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[15]
8017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[ 4]
8027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[ 5]
8037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[ 6]
8047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[ 7]
8057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r7, r8		@ r14= s0 = x[ 4] - x[ 5]
8067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r14,r8, LSL #1	@ r8 =      x[ 4] + x[ 5]  -> x[13]
8077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r7, r12,r11		@ r7 = s1 = x[ 7] - x[ 6]
8087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r7, r11, LSL #1	@ r11=      x[ 7] + x[ 6]  -> x[15]
8097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s2 = x[12] - x[13]
8107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[12] + x[13]  -> x[12]
8117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r9, r10		@ r12= s3 = x[14] - x[15]
8127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r12,r10,LSL #1	@ r10=      x[14] + x[15]  -> x[14]
8137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r8,r10,r11}
8147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r7,r12,r14}
8157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
8167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly_8
8177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMDB	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
8187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[0]
8197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r7 = x[1]
8207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[2]
8217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[3]
8227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[4]
8237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[5]
8247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[6]
8257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r14= x[7]
8267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
8277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
8287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
8297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
8307913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
8317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
8327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
8337913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]
8347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
8357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
8367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
8377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
8387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
8397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
8407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
8417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
8427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
8437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMDB	r1,{r2,r3,r4,r5,r10,r11,r12,r14}
8447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
8457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly_8
8467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
8477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[0]
8487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r7 = x[1]
8497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[2]
8507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[3]
8517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[4]
8527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[5]
8537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[6]
8547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r14= x[7]
8557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
8567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
8577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
8587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
8597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
8607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
8617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
8627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]
8637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
8647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
8657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
8667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
8677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
8687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
8697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
8707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
8717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
8727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1,{r2,r3,r4,r5,r10,r11,r12,r14}
8737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
8747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block 2
8757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r1,r1,#16*4-8*4
8767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r4,r1,#8*4
8777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[ 8]
8787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[ 9]
8797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[10]
8807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[11]
8817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[0]
8827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[1]
8837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[2]
8847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[3]
8857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s0 = x[ 8] - x[ 9]
8867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[ 8] + x[ 9]  -> x[ 8]
8877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9, r10		@ r9 = s1 = x[10] - x[11]
8887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r9, r10,LSL #1	@ r10=      x[10] + x[11]  -> x[10]
8897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r8, r8, r7		@ r8 = s2 = x[ 1] - x[ 0]
8907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7, r8, r7, LSL #1	@ r7 =      x[ 1] + x[ 0]  -> x[ 9]
8917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r11		@ r12= s3 = x[ 3] - x[ 2]
8927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r12,r11, LSL #1	@ r11=      x[ 3] + x[ 2]  -> x[11]
8937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r14,cPI2_8
8947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r7,r10,r11}
8957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
8967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r9		@ r5 = s0 - s1
8977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r5, r9, LSL #1	@ r9 = s0 + s1
8987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r5, r14,r5		@ (r6,r5)  = (s0-s1)*cPI2_8
8997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r12,r8		@ r12= s3 - s2
9007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r12,r8, LSL #1	@ r8 = s3 + s2
9017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r8, r14,r8		@ (r6,r8)  = (s3+s2)*cPI2_8
9037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r5, r5, LSL #1
9047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r9, r14,r9		@ (r6,r9)  = (s0+s1)*cPI2_8
9057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r8, r8, LSL #1
9067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r6, r12,r14,r12		@ (r6,r12) = (s3-s2)*cPI2_8
9077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r9, r9, LSL #1
9087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r12,r12,LSL #1
9097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r8,r9,r12}
9107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ block4
9127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r4,{r5,r6,r9,r10}	@ r5 = x[12]
9137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[13]
9147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[14]
9157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[15]
9167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r7,r8,r11,r12}	@ r7 = x[ 4]
9177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[ 5]
9187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[ 6]
9197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[ 7]
9207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, r6		@ r5 = s2 = x[12] - x[13]
9217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r5, r6, LSL #1	@ r6 =      x[12] + x[13]  -> x[12]
9227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r9, r10		@ r9 = s3 = x[14] - x[15]
9237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r9, r10,LSL #1	@ r10=      x[14] + x[15]  -> x[14]
9247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r7, r8		@ r14= s0 = x[ 4] - x[ 5]
9257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r14,r8, LSL #1	@ r8 =      x[ 4] + x[ 5]  -> x[13]
9267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r7, r12,r11		@ r7 = s1 = x[ 7] - x[ 6]
9277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r11,r7, r11, LSL #1	@ r11=      x[ 7] + x[ 6]  -> x[15]
9287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r4!,{r6,r8,r10,r11}
9297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1!,{r5,r7,r9,r14}
9307913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly_8
9327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMDB	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
9337913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[0]
9347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r7 = x[1]
9357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[2]
9367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[3]
9377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[4]
9387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[5]
9397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[6]
9407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r14= x[7]
9417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
9427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
9437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
9447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
9457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
9467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
9477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
9487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]
9497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
9517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
9527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
9537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
9547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
9557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
9567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
9577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
9587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMDB	r1,{r2,r3,r4,r5,r10,r11,r12,r14}
9597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_butterfly_8
9617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMIA	r1,{r6,r7,r8,r9,r10,r11,r12,r14}
9627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r6 = x[0]
9637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r7 = x[1]
9647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r8 = x[2]
9657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r9 = x[3]
9667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r10= x[4]
9677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r11= x[5]
9687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r12= x[6]
9697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang					@ r14= x[7]
9707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r6, r7		@ r6 = s0 = x[0] + x[1]
9717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r7, r6, r7, LSL #1	@ r7 = s1 = x[0] - x[1]
9727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r8, r9		@ r8 = s2 = x[2] + x[3]
9737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r9, r8, r9, LSL #1	@ r9 = s3 = x[2] - x[3]
9747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r10,r11		@ r10= s4 = x[4] + x[5]
9757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r10,r11,LSL #1	@ r11= s5 = x[4] - x[5]
9767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r12,r14		@ r12= s6 = x[6] + x[7]
9777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r12,r14,LSL #1	@ r14= s7 = x[6] - x[7]
9787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r2, r11,r9		@ r2 = x[0] = s5 + s3
9807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r4, r2, r9, LSL #1	@ r4 = x[2] = s5 - s3
9817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r3, r14,r7		@ r3 = x[1] = s7 - s1
9827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r5, r3, r7, LSL #1	@ r5 = x[3] = s7 + s1
9837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r10,r10,r6		@ r10= x[4] = s4 - s0
9847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r12,r8		@ r11= x[5] = s6 - s2
9857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r12,r10,r6, LSL #1	@ r12= x[6] = s4 + s0
9867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r14,r11,r8, LSL #1	@ r14= x[7] = s6 + s2
9877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STMIA	r1,{r2,r3,r4,r5,r10,r11,r12,r14}
9887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r1,r1,#8*4
9907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUBS	r0,r0,#64
9917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BGT	mdct_bufferflies_loop3
9927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMFD	r13,{r0-r3}
9947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
9957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_bitreverseARM:
	@ Bit-reversal pass.  w walks downwards from the middle of the buffer
	@ two words at a time; for each position the pair at w is swapped with
	@ the pair at xx = x + (bitrev(bit) >> shift) when w lies above xx.
	@ The reversed index is built from two lookups in the byte table
	@ 'bitrev' (low 6 bits and the bits above them).
	@ Pointer comparisons use the unsigned condition HI: addresses are
	@ unsigned, so a signed GT would mis-order pointers on either side of
	@ 0x80000000.
996021523c8f11a487b993a1bce5304752b21754574Gloria Wang	@ r0 = points = n
9977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in
9987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = step
9997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 = shift
10007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r4, #0			@ r4 = bit = 0
10027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r5, r1, r0, LSL #1	@ r5 = w = x + (n>>1)
10037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADR	r6, bitrev		@ r6 = base of the bit-reverse byte table
10047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r5, r5, #8		@ w -= 2 (first pair to examine)
10057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangbrev_lp:
10067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDRB	r7, [r6, r4, LSR #6]	@ r7 = bitrev[bit>>6]
10077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	AND	r8, r4, #0x3f		@ r8 = bit & 0x3f
10087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDRB	r8, [r6, r8]		@ r8 = bitrev[bit & 0x3f]
10097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r4, r4, #1		@ bit++
10107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ stall XScale
10117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ORR	r7, r7, r8, LSL #6	@ r7 = b = reversed index for the old bit
1012021523c8f11a487b993a1bce5304752b21754574Gloria Wang	MOV	r7, r7, LSR r3		@ r7 = b>>shift
1013021523c8f11a487b993a1bce5304752b21754574Gloria Wang	ADD	r9, r1, r7, LSL #2	@ r9 = xx = x + (b>>shift)
10147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r5, r9			@ if (w > xx)  (unsigned pointer compare)
10157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r5],#-8		@   r10 = w[0]		w -= 2
10167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDRHI	r11,[r5,#12]		@   r11 = w[1]
10177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDRHI	r12,[r9]		@   r12 = xx[0]
10187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDRHI	r14,[r9,#4]		@   r14 = xx[1]
10197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STRHI	r10,[r9]		@   xx[0]= w[0]
10207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STRHI	r11,[r9,#4]		@   xx[1]= w[1]
10217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STRHI	r12,[r5,#8]		@   w[0] = xx[0]
10227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STRHI	r14,[r5,#12]		@   w[1] = xx[1]
10237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r5,r1			@ r5 already points at the next pair
10247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BHI	brev_lp			@ loop while (next w > x), unsigned
10257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ mdct_step7
10277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = points
10287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in
10297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = step
1030021523c8f11a487b993a1bce5304752b21754574Gloria Wang	@ r3 = shift
	@ First half of step 7: w0 (r1) walks up from x, w1 (r7) walks down
	@ from x + (n>>1), and T (r5) walks up through the sin/cos table until
	@ it reaches Ttop (r8).  r0, r3 and r4 are used as scratch from here on.
10317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r2, #4			@ r5 = T = (step>=4) ?
1033e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADR	r7, .Lsincos_lookup	@          sincos_lookup0 +
1034e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADDLT	r7, #4			@          sincos_lookup1
1035e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	LDR	r5, [r7]		@ r5 = word stored at the selected slot
1036e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADD	r5, r7			@ r5 += slot address (presumably a self-relative
					@ offset; .Lsincos_lookup is defined outside this view)
10377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7, r1, r0, LSL #1	@ r7 = w1 = x + (n>>1)
10387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADDGE	r5, r5, r2, LSL #1	@		            (step>>1)
					@ (GE still refers to the CMP r2, #4 above)
10397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r8, r5, #1024*4		@ r8 = Ttop
10407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangstep7_loop1:
10417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6, [r1]		@ r6 = w0[0]
10427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r9, [r1,#4]		@ r9 = w0[1]
10437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r7,#-8]!		@ r10= w1[0]	w1 -= 2
10447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r11,[r7,#4]		@ r11= w1[1]
10457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r14,[r5,#4]		@ r14= T[1]
10467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r12,[r5],r2,LSL #2	@ r12= T[0]	T += step
10477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r6, r10		@ r6 = s0 = w0[0] + w1[0]
10497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r10,r6, r10,LSL #1	@ r10= s1b= w0[0] - w1[0]
10507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r11,r9		@ r11= s1 = w1[1] - w0[1]
10517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r11,r9, LSL #1	@ r9 = s0b= w1[1] + w0[1]
10527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ Can save 1 cycle by using SMULL SMLAL - at the cost of being
10547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ 1 off.
	@ Each product is a separate SMULL; only the high words are combined.
10557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r3, r6, r14		@ (r0,r3)   = s0*T[1]
10567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r4, r11,r12		@ (r0,r4)   = s1*T[0]
10577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r3, r3, r4		@ r3 = s2 = hi(s0*T[1]) + hi(s1*T[0])
10587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r14,r11,r14		@ (r0,r14)  = s1*T[1]
10597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r12,r6, r12		@ (r0,r12)  = s0*T[0]
10607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r14,r12		@ r14= s3 = hi(s1*T[1]) - hi(s0*T[0])
10617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r9 = s0b<<1	(un-halved sum; the ASR #1 below halves it)
10637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r10= s1b<<1	(un-halved difference; the ASR #1 below halves it)
10647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r3, r9, ASR #1	@ r9 = s0b + s2
10657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r3, r9, r3, LSL #1	@ r3 = s0b - s2
10667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r14,r10,ASR #1	@ r12= s3  - s1b
10687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r14,r10,ASR #1	@ r10= s3  + s1b
10697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r9, [r1],#4		@ w0[0] = s0b + s2
10707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r10,[r1],#4		@ w0[1] = s3 + s1b	w0 += 2
10717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r3, [r7]		@ w1[0] = s0b - s2
10727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r12,[r7,#4]		@ w1[1] = s3 - s1b
10737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r5,r8			@ T vs Ttop
10757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BLO	step7_loop1		@ while (T < Ttop), unsigned pointer compare
10767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangstep7_loop2:
	@ Second half of step 7: same butterfly as step7_loop1, but T (r5)
	@ now walks back down the table (pre-decremented by step) and the
	@ roles of T[0] and T[1] in the products are exchanged.  Runs until
	@ w0 (r1) meets w1 (r7).
10787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6, [r1]		@ r6 = w0[0]
10797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r9, [r1,#4]		@ r9 = w0[1]
10807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r10,[r7,#-8]!		@ r10= w1[0]	w1 -= 2
10817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r11,[r7,#4]		@ r11= w1[1]
10827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r14,[r5,-r2,LSL #2]!	@ r14= T[0]	T -= step (before the load)
10837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r12,[r5,#4]		@ r12= T[1]
10847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r6, r6, r10		@ r6 = s0 = w0[0] + w1[0]
10867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r10,r6, r10,LSL #1	@ r10= s1b= w0[0] - w1[0]
10877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r11,r11,r9		@ r11= s1 = w1[1] - w0[1]
10887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r11,r9, LSL #1	@ r9 = s0b= w1[1] + w0[1]
10897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ Can save 1 cycle by using SMULL SMLAL - at the cost of being
10917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ 1 off.
	@ Each product is a separate SMULL; only the high words are combined.
10927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r3, r6, r14		@ (r0,r3)   = s0*T[0]
10937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r4, r11,r12		@ (r0,r4)   = s1*T[1]
10947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r3, r3, r4		@ r3 = s2 = hi(s0*T[0]) + hi(s1*T[1])
10957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r14,r11,r14		@ (r0,r14)  = s1*T[0]
10967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r0, r12,r6, r12		@ (r0,r12)  = s0*T[1]
10977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r14,r14,r12		@ r14= s3 = hi(s1*T[0]) - hi(s0*T[1])
10987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
10997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r9 = s0b<<1	(un-halved sum; the ASR #1 below halves it)
11007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r10= s1b<<1	(un-halved difference; the ASR #1 below halves it)
11017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r9, r3, r9, ASR #1	@ r9 = s0b + s2
11027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r3, r9, r3, LSL #1	@ r3 = s0b - s2
11037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
11047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SUB	r12,r14,r10,ASR #1	@ r12= s3  - s1b
11057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r10,r14,r10,ASR #1	@ r10= s3  + s1b
11067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r9, [r1],#4		@ w0[0] = s0b + s2
11077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r10,[r1],#4		@ w0[1] = s3 + s1b	w0 += 2
11087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r3, [r7]		@ w1[0] = s0b - s2
11097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r12,[r7,#4]		@ w1[1] = s3 - s1b
11107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
11117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r1,r7			@ w0 vs w1
11127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BLO	step7_loop2		@ while (w0 < w1), unsigned pointer compare
11137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
11147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMFD	r13!,{r0-r3}		@ reload r0-r3 from the stack and pop them
11157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
11167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r0 = points
11177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r1 = in
11187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r2 = step
11197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ r3 = shift
	@ Step 8: rotate each (x[0], x[1]) pair in the first half of the
	@ buffer by the table entry at T, advancing T by the reduced step.
	@ Skipped entirely when the reduced step is 0 or 1.
11207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r2, r2, ASR #2		@ r2 = step >>= 2
11217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r2, #0
11227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMPNE	r2, #1
11237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BEQ	mdct_end		@ step == 0 or step == 1: nothing to do here
11247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
11257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ step > 1 (default case)
11267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r2, #4			@ r5 = T = (step>=4) ?
1127e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADR	r7, .Lsincos_lookup	@          sincos_lookup0 +
1128e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADDLT	r7, #4			@          sincos_lookup1
1129e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	LDR	r5, [r7]		@ r5 = word stored at the selected slot
1130e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	ADD	r5, r7			@ r5 += slot address (presumably a self-relative
					@ offset; .Lsincos_lookup is defined outside this view)
11317913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADD	r7, r1, r0, LSL #1	@ r7 = iX = x + (n>>1)
11327913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	ADDGE	r5, r5, r2, LSL #1	@		            (step>>1)
					@ (GE still refers to the CMP r2, #4 above)
11337913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_step8_default:
11347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r6, [r1],#4		@ r6 =  s0 = x[0]
11357913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r8, [r1],#4		@ r8 = -s1 = x[1]
11367913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r12,[r5,#4]       	@ r12= T[1]
11377913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDR	r14,[r5],r2,LSL #2	@ r14= T[0]	T += step
11387913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r8, r8, #0		@ r8 = s1
11397913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
11407913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ XPROD31(s0, s1, T[0], T[1], x, x+1)
11417913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ x[0] = s0 * T[0] + s1 * T[1]      x[1] = s1 * T[0] - s0 * T[1]
11427913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r9, r10, r8, r12	@ (r9,r10)  = s1 * T[1]
11437913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	CMP	r1, r7			@ x vs iX; flags are consumed by the BLO at the
					@ bottom (nothing in between sets flags)
11447913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r9, r10, r6, r14	@ (r9,r10) += s0 * T[0]
11457913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	RSB	r6, r6, #0		@ r6 = -s0
11467913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMULL	r9, r11, r8, r14	@ (r9,r11)  = s1 * T[0]
11477913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r10,r10,LSL #1		@ r10 = high word << 1
11487913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	SMLAL	r9, r11, r6, r12	@ (r9,r11) -= s0 * T[1]
11497913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r10,[r1,#-8]		@ x[0] = r10 (r1 was already advanced by 2 words)
11507913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r11,r11,LSL #1		@ r11 = high word << 1
11517913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	STR	r11,[r1,#-4]		@ x[1] = r11
11527913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	BLO	mdct_step8_default	@ while (x < iX), unsigned pointer compare
11537913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
11547913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangmdct_end:
	@ Common exit: hand back the step value held in r2, then pop r4-r11
	@ and the return address (presumably pushed by a prologue that lies
	@ outside this view).
11557913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	MOV	r0, r2			@ r0 = return value = r2
11567913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	LDMFD	r13!,{r4-r11,PC}	@ restore r4-r11; loading PC performs the return
11577913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
	@ Fixed-point constants: each word is approximately cos(k*pi/8) * 2^31.
11587913073ddf11ca3dd7b0439998e1b17d443bb0baGloria WangcPI1_8:
11597913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.word	0x7641af3d		@ ~0.9238795 * 2^31  (cos(pi/8))
11607913073ddf11ca3dd7b0439998e1b17d443bb0baGloria WangcPI2_8:
11617913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.word	0x5a82799a		@ ~0.7071068 * 2^31  (cos(2*pi/8))
11627913073ddf11ca3dd7b0439998e1b17d443bb0baGloria WangcPI3_8:
11637913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.word	0x30fbc54d		@ ~0.3826834 * 2^31  (cos(3*pi/8))
@ bitrev: 64-entry byte table. Entry i holds the value of i with its 6 bits
@ reversed (e.g. 1 -> 32, 2 -> 16, 3 -> 48, 32 -> 1, 63 -> 63).
@ The code that indexes this table is outside this excerpt.
11647913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wangbitrev:
11657913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	0
11667913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	32
11677913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	16
11687913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	48
11697913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	8
11707913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	40
11717913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	24
11727913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	56
11737913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	4
11747913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	36
11757913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	20
11767913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	52
11777913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	12
11787913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	44
11797913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	28
11807913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	60
11817913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	2
11827913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	34
11837913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	18
11847913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	50
11857913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	10
11867913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	42
11877913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	26
11887913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	58
11897913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	6
11907913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	38
11917913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	22
11927913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	54
11937913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	14
11947913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	46
11957913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	30
11967913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	62
11977913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	1
11987913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	33
11997913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	17
12007913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	49
12017913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	9
12027913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	41
12037913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	25
12047913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	57
12057913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	5
12067913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	37
12077913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	21
12087913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	53
12097913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	13
12107913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	45
12117913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	29
12127913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	61
12137913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	3
12147913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	35
12157913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	19
12167913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	51
12177913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	11
12187913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	43
12197913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	27
12207913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	59
12217913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	7
12227913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	39
12237913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	23
12247913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	55
12257913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	15
12267913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	47
12277913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	31
12287913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	.byte	63
12297913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang
@ .Lsincos_lookup: two words, each holding the distance from the word's own
@ address to another symbol. Word 0 sits at .Lsincos_lookup and points at
@ sincos_lookup0; word 1 sits at .Lsincos_lookup+4 and points at
@ sincos_lookup1. Adding a word's value to that word's address therefore
@ yields the address of the corresponding symbol.
@ NOTE(review): presumably this lets the code locate the tables without
@ absolute-address relocations here -- confirm against the code that loads
@ these words (outside this excerpt).
1230e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel.Lsincos_lookup:
1231e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	.word	sincos_lookup0-.Lsincos_lookup	@ offset from this word to sincos_lookup0
1232e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel	.word	sincos_lookup1-(.Lsincos_lookup+4)	@ offset from this word to sincos_lookup1
1233e96d449a28f9679ca1ac22e21bd1cf1d68d2cb4fArd Biesheuvel
12347913073ddf11ca3dd7b0439998e1b17d443bb0baGloria Wang	@ END
1235