11b362b15af34006e6a11974088a46d42b903418eJohann;
21b362b15af34006e6a11974088a46d42b903418eJohann;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
31b362b15af34006e6a11974088a46d42b903418eJohann;
41b362b15af34006e6a11974088a46d42b903418eJohann;  Use of this source code is governed by a BSD-style license
51b362b15af34006e6a11974088a46d42b903418eJohann;  that can be found in the LICENSE file in the root of the source
61b362b15af34006e6a11974088a46d42b903418eJohann;  tree. An additional intellectual property rights grant can be found
71b362b15af34006e6a11974088a46d42b903418eJohann;  in the file PATENTS.  All contributing project authors may
81b362b15af34006e6a11974088a46d42b903418eJohann;  be found in the AUTHORS file in the root of the source tree.
91b362b15af34006e6a11974088a46d42b903418eJohann;
101b362b15af34006e6a11974088a46d42b903418eJohann
111b362b15af34006e6a11974088a46d42b903418eJohann
; Assembler setup (armasm syntax): publish the two entry points defined in
; this file and open the code section that holds them and their constants.
121b362b15af34006e6a11974088a46d42b903418eJohann    EXPORT  |vp8_short_fdct4x4_neon|
131b362b15af34006e6a11974088a46d42b903418eJohann    EXPORT  |vp8_short_fdct8x4_neon|
141b362b15af34006e6a11974088a46d42b903418eJohann
151b362b15af34006e6a11974088a46d42b903418eJohann    ARM                 ; assemble the following code as ARM (not Thumb) instructions
161b362b15af34006e6a11974088a46d42b903418eJohann    REQUIRE8            ; this file requires 8-byte stack alignment
171b362b15af34006e6a11974088a46d42b903418eJohann    PRESERVE8           ; this file preserves 8-byte stack alignment
181b362b15af34006e6a11974088a46d42b903418eJohann
191b362b15af34006e6a11974088a46d42b903418eJohann    AREA ||.text||, CODE, READONLY, ALIGN=4    ; read-only code section aligned to 2^4 = 16 bytes
201b362b15af34006e6a11974088a46d42b903418eJohann
211b362b15af34006e6a11974088a46d42b903418eJohann
; Constant table read by both transforms in this file. Each routine points
; r12 at coeff and consumes the table front to back with three @128 loads:
;   bytes  0-15 : 16-bit multipliers, 4 x 5352 then 4 x 2217
;   bytes 16-47 : 32-bit values 4 x 14500 then 4 x 7500, used to pre-load
;                 the accumulators that are shifted right by 12
;   bytes 48-79 : 32-bit values 4 x 12000 then 4 x 51000, used to pre-load
;                 the accumulators that are shifted right by 16
; Every value is repeated four times so that a single load fills all lanes
; of a vector register. The order and element sizes must not change: the
; loads address the table purely by position.
221b362b15af34006e6a11974088a46d42b903418eJohann    ALIGN 16    ; 16-byte boundary: required by the @128-qualified loads of this table
231b362b15af34006e6a11974088a46d42b903418eJohanncoeff
241b362b15af34006e6a11974088a46d42b903418eJohann    DCW      5352,  5352,  5352, 5352       ; 16-bit, loaded into d16
251b362b15af34006e6a11974088a46d42b903418eJohann    DCW      2217,  2217,  2217, 2217       ; 16-bit, loaded into d17
261b362b15af34006e6a11974088a46d42b903418eJohann    DCD     14500, 14500, 14500, 14500      ; 32-bit, added before the >>12 that uses +5352*d1
271b362b15af34006e6a11974088a46d42b903418eJohann    DCD      7500,  7500,  7500, 7500       ; 32-bit, added before the >>12 that uses -5352*c1
281b362b15af34006e6a11974088a46d42b903418eJohann    DCD     12000, 12000, 12000, 12000      ; 32-bit, added before the >>16 that uses +5352*d1
291b362b15af34006e6a11974088a46d42b903418eJohann    DCD     51000, 51000, 51000, 51000      ; 32-bit, added before the >>16 that uses -5352*c1
301b362b15af34006e6a11974088a46d42b903418eJohann
;-----------------------------------------------------------------------
; vp8_short_fdct4x4_neon: two-pass 4x4 forward transform on 16-bit data.
;
; In:     r0 = input.  Four rows of four 16-bit values are loaded; r0 is
;              stepped by r2 after every row and each row address carries
;              a @64 alignment qualifier.
;         r1 = output. All sixteen 16-bit results are written by a single
;              @128-qualified 32-byte store, in the order op[0]..op[15].
;         r2 = pitch, used as-is as the byte step between input rows.
; Writes: r0, r12, d0-d7, d16-d26.  d8-d15 are never touched and the
;         stack is not used.
;
; Part one combines the four values of each input row (inputs scaled by 8,
; products rounded and shifted right by 12).  Part two combines the four
; values of each column of the part-one result (shifts by 4 and by 16).
; A 4x4 transpose precedes each part so that the four values being combined
; sit in four different d registers, one row (part one) or one column
; (part two) per lane.
;
; The prototype below is taken from the original comment, which named the
; _c variant of this function.
;-----------------------------------------------------------------------
311b362b15af34006e6a11974088a46d42b903418eJohann;void vp8_short_fdct4x4_neon(short *input, short *output, int pitch)
321b362b15af34006e6a11974088a46d42b903418eJohann|vp8_short_fdct4x4_neon| PROC
331b362b15af34006e6a11974088a46d42b903418eJohann
341b362b15af34006e6a11974088a46d42b903418eJohann    ; Part one
    ; The four row loads are interleaved with the loads of the constant table.
351b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {d0}, [r0@64], r2       ; d0 = input row 0, r0 += r2
361b362b15af34006e6a11974088a46d42b903418eJohann    adr             r12, coeff              ; r12 = start of the constant table
371b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {d1}, [r0@64], r2       ; d1 = input row 1, r0 += r2
381b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217 (r12 += 16)
391b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {d2}, [r0@64], r2       ; d2 = input row 2, r0 += r2
401b362b15af34006e6a11974088a46d42b903418eJohann    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500 (r12 += 32)
411b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {d3}, [r0@64], r2       ; d3 = input row 3, r0 += r2
421b362b15af34006e6a11974088a46d42b903418eJohann
431b362b15af34006e6a11974088a46d42b903418eJohann    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
    ; (after the four vtrn, lane i of dN holds element N of input row i)
441b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         d0, d2
451b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         d1, d3
461b362b15af34006e6a11974088a46d42b903418eJohann    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000 (part-two constants, loaded early)
471b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         d0, d1
481b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         d2, d3
491b362b15af34006e6a11974088a46d42b903418eJohann
501b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
511b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
521b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
531b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
541b362b15af34006e6a11974088a46d42b903418eJohann
    ; q2 = d4:d5 and q3 = d6:d7, so two shifts scale all four terms by 8
551b362b15af34006e6a11974088a46d42b903418eJohann    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
561b362b15af34006e6a11974088a46d42b903418eJohann    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
571b362b15af34006e6a11974088a46d42b903418eJohann
581b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
591b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
601b362b15af34006e6a11974088a46d42b903418eJohann
    ; q9/q10 already hold the rounding constants, so the widening
    ; multiply-accumulates add the 32-bit products directly onto them
611b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
621b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
631b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
641b362b15af34006e6a11974088a46d42b903418eJohann    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
651b362b15af34006e6a11974088a46d42b903418eJohann
    ; shift right and narrow back to 16 bits
661b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
671b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
681b362b15af34006e6a11974088a46d42b903418eJohann
691b362b15af34006e6a11974088a46d42b903418eJohann
701b362b15af34006e6a11974088a46d42b903418eJohann    ; Part two
711b362b15af34006e6a11974088a46d42b903418eJohann
721b362b15af34006e6a11974088a46d42b903418eJohann    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    ; (d0-d3 become rows 0-3 of the part-one result, one column per lane)
731b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         d0, d2
741b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         d1, d3
751b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         d0, d1
761b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         d2, d3
771b362b15af34006e6a11974088a46d42b903418eJohann
781b362b15af34006e6a11974088a46d42b903418eJohann    vmov.s16        d26, #7         ; +7 bias applied before the >>4 below
791b362b15af34006e6a11974088a46d42b903418eJohann
801b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
811b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
821b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
831b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        d4, d4, d26     ; a1 + 7
841b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
851b362b15af34006e6a11974088a46d42b903418eJohann
861b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
871b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
881b362b15af34006e6a11974088a46d42b903418eJohann
891b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
901b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
911b362b15af34006e6a11974088a46d42b903418eJohann
    ; d4 (a1 + 7) was last read above, so it is reused for the d1 == 0 mask
921b362b15af34006e6a11974088a46d42b903418eJohann    vceq.s16        d4, d7, #0      ; d4 = all-ones in lanes where d1 == 0
931b362b15af34006e6a11974088a46d42b903418eJohann
941b362b15af34006e6a11974088a46d42b903418eJohann    vshr.s16        d0, d0, #4      ; op[0] = (a1 + b1 + 7)>>4
951b362b15af34006e6a11974088a46d42b903418eJohann    vshr.s16        d2, d2, #4      ; op[8] = (a1 - b1 + 7)>>4
961b362b15af34006e6a11974088a46d42b903418eJohann
971b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
981b362b15af34006e6a11974088a46d42b903418eJohann    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
991b362b15af34006e6a11974088a46d42b903418eJohann
100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    vmvn            d4, d4          ; invert: d4 = -1 in lanes where d1 != 0, else 0
1011b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
1021b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)   (subtracting -1 adds 1)
1031b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
1041b362b15af34006e6a11974088a46d42b903418eJohann
    ; q0:q1 = d0..d3 = op[0..3], op[4..7], op[8..11], op[12..15]
1051b362b15af34006e6a11974088a46d42b903418eJohann    vst1.16         {q0, q1}, [r1@128]
1061b362b15af34006e6a11974088a46d42b903418eJohann
1071b362b15af34006e6a11974088a46d42b903418eJohann    bx              lr
1081b362b15af34006e6a11974088a46d42b903418eJohann
1091b362b15af34006e6a11974088a46d42b903418eJohann    ENDP
1101b362b15af34006e6a11974088a46d42b903418eJohann
;-----------------------------------------------------------------------
; vp8_short_fdct8x4_neon: two-pass forward transform of two side-by-side
; 4x4 blocks in one call.  Block A is elements 0-3 of every 8-wide input
; row (low d register of each q), block B is elements 4-7 (high d register).
;
; In:     r0 = input.  Four rows of eight 16-bit values are loaded; r0 is
;              stepped by r2 after every row and each row address carries
;              a @128 alignment qualifier.
;         r1 = output. Block A's sixteen 16-bit results are stored first,
;              then block B's, with two @128-qualified 32-byte stores that
;              write back r1 (r1 ends 64 bytes past its entry value).
;         r2 = pitch, used as-is as the byte step between input rows.
; Writes: r0, r1, r12, q0-q3, q8-q15.  d8-d15 (q4-q7) are never touched
;         and the stack is not used.
;
; Part one combines the four values of each input row (inputs scaled by 8,
; products rounded and shifted right by 12).  Part two combines the four
; values of each column of the part-one result (shifts by 4 and by 16).
; Because the 16-bit products widen to 32 bits, each q register of inputs
; needs two q accumulators: one for its A half and one for its B half.
;
; The prototype below is taken from the original comment, which named the
; _c variant of this function.
;-----------------------------------------------------------------------
1111b362b15af34006e6a11974088a46d42b903418eJohann;void vp8_short_fdct8x4_neon(short *input, short *output, int pitch)
1121b362b15af34006e6a11974088a46d42b903418eJohann|vp8_short_fdct8x4_neon| PROC
1131b362b15af34006e6a11974088a46d42b903418eJohann
1141b362b15af34006e6a11974088a46d42b903418eJohann    ; Part one
1151b362b15af34006e6a11974088a46d42b903418eJohann
    ; The four row loads are interleaved with the loads of the constant table.
1161b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {q0}, [r0@128], r2      ; q0 = input row 0, r0 += r2
1171b362b15af34006e6a11974088a46d42b903418eJohann    adr             r12, coeff              ; r12 = start of the constant table
1181b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {q1}, [r0@128], r2      ; q1 = input row 1, r0 += r2
1191b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217 (r12 += 16)
1201b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {q2}, [r0@128], r2      ; q2 = input row 2, r0 += r2
1211b362b15af34006e6a11974088a46d42b903418eJohann    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500 (r12 += 32, now at the part-two constants)
1221b362b15af34006e6a11974088a46d42b903418eJohann    vld1.16         {q3}, [r0@128], r2      ; q3 = input row 3, r0 += r2
1231b362b15af34006e6a11974088a46d42b903418eJohann
1241b362b15af34006e6a11974088a46d42b903418eJohann    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
    ; The bracketed notes give the state once all four vtrn have executed:
    ; q0=[A0|B0] q1=[A1|B1] q2=[A2|B2] q3=[A3|B3], where An / Bn is element n
    ; of every row of block A / block B (one row per lane).
1251b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         q0, q2          ; [A0|B0]
1261b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         q1, q3          ; [A1|B1]
1271b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         q0, q1          ; [A2|B2]
1281b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         q2, q3          ; [A3|B3]
1291b362b15af34006e6a11974088a46d42b903418eJohann
1301b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
1311b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
1321b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
1331b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
1341b362b15af34006e6a11974088a46d42b903418eJohann
1351b362b15af34006e6a11974088a46d42b903418eJohann    vshl.s16        q11, q11, #3    ; a1 << 3
1361b362b15af34006e6a11974088a46d42b903418eJohann    vshl.s16        q12, q12, #3    ; b1 << 3
1371b362b15af34006e6a11974088a46d42b903418eJohann    vshl.s16        q13, q13, #3    ; c1 << 3
1381b362b15af34006e6a11974088a46d42b903418eJohann    vshl.s16        q14, q14, #3    ; d1 << 3
1391b362b15af34006e6a11974088a46d42b903418eJohann
1401b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
1411b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
1421b362b15af34006e6a11974088a46d42b903418eJohann
    ; a1/b1 are no longer needed, so q11/q12 become block B's accumulators
    ; (q9/q10 serve block A); both pairs start from the rounding constants
1431b362b15af34006e6a11974088a46d42b903418eJohann    vmov.s16        q11, q9         ; 14500
1441b362b15af34006e6a11974088a46d42b903418eJohann    vmov.s16        q12, q10        ; 7500
1451b362b15af34006e6a11974088a46d42b903418eJohann
    ; d28/d29 = d1 of block A/B, d26/d27 = c1 of block A/B
1461b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
1471b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
1481b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
1491b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
1501b362b15af34006e6a11974088a46d42b903418eJohann
1511b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
1521b362b15af34006e6a11974088a46d42b903418eJohann    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
1531b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
1541b362b15af34006e6a11974088a46d42b903418eJohann    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
1551b362b15af34006e6a11974088a46d42b903418eJohann
    ; narrow into q1 = d2:d3 = [A1|B1] and q3 = d6:d7 = [A3|B3]
1561b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
1571b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
1581b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
1591b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
1601b362b15af34006e6a11974088a46d42b903418eJohann
1611b362b15af34006e6a11974088a46d42b903418eJohann
1621b362b15af34006e6a11974088a46d42b903418eJohann    ; Part two
1631b362b15af34006e6a11974088a46d42b903418eJohann    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
1641b362b15af34006e6a11974088a46d42b903418eJohann
1651b362b15af34006e6a11974088a46d42b903418eJohann    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
    ; The notes give the state once all four vtrn have executed: qN holds
    ; row N of the part-one result for block A (low half) and B (high half).
1661b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         q0, q2          ; q0=[A0 | B0]
1671b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.32         q1, q3          ; q1=[A4 | B4]
1681b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         q0, q1          ; q2=[A8 | B8]
1691b362b15af34006e6a11974088a46d42b903418eJohann    vtrn.16         q2, q3          ; q3=[A12|B12]
1701b362b15af34006e6a11974088a46d42b903418eJohann
1711b362b15af34006e6a11974088a46d42b903418eJohann    vmov.s16        q15, #7         ; +7 bias applied before the >>4 below
1721b362b15af34006e6a11974088a46d42b903418eJohann
1731b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
1741b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
1751b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        q11, q11, q15   ; a1 + 7
1761b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
1771b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
1781b362b15af34006e6a11974088a46d42b903418eJohann
    ; q0 = d0:d1 = [A|B] sums, q1 = d2:d3 = [A|B] differences
1791b362b15af34006e6a11974088a46d42b903418eJohann    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
1801b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
1811b362b15af34006e6a11974088a46d42b903418eJohann
    ; a1/b1 are consumed; reuse q11/q12 as block B's accumulators
1821b362b15af34006e6a11974088a46d42b903418eJohann    vmov.s16        q11, q9         ; 12000
1831b362b15af34006e6a11974088a46d42b903418eJohann    vmov.s16        q12, q10        ; 51000
1841b362b15af34006e6a11974088a46d42b903418eJohann
    ; The B halves are moved while shifting (d1 -> d4, d3 -> d6) so that
    ; q0:q1 ends up holding only block A and q2:q3 only block B.
1851b362b15af34006e6a11974088a46d42b903418eJohann    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
1861b362b15af34006e6a11974088a46d42b903418eJohann    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
1871b362b15af34006e6a11974088a46d42b903418eJohann    vshr.s16        d2, d2, #4      ; A[8] = (a1 - b1 + 7)>>4
1881b362b15af34006e6a11974088a46d42b903418eJohann    vshr.s16        d6, d3, #4      ; B[8] = (a1 - b1 + 7)>>4
1891b362b15af34006e6a11974088a46d42b903418eJohann
1901b362b15af34006e6a11974088a46d42b903418eJohann
1911b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
1921b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
1931b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
1941b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
1951b362b15af34006e6a11974088a46d42b903418eJohann
    ; d1 has been multiplied for the last time, so q14 is turned in place
    ; into the d1 == 0 mask
1961b362b15af34006e6a11974088a46d42b903418eJohann    vceq.s16        q14, q14, #0    ; all-ones in lanes where d1 == 0
1971b362b15af34006e6a11974088a46d42b903418eJohann
1981b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
1991b362b15af34006e6a11974088a46d42b903418eJohann    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
2001b362b15af34006e6a11974088a46d42b903418eJohann    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
2011b362b15af34006e6a11974088a46d42b903418eJohann    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
2021b362b15af34006e6a11974088a46d42b903418eJohann
203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    vmvn            q14, q14        ; invert: -1 in lanes where d1 != 0, else 0
2041b362b15af34006e6a11974088a46d42b903418eJohann
    ; d1 and d3 were read by the >>4 shifts above and are free to overwrite
2051b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
2061b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
2071b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)   (subtracting -1 adds 1)
2081b362b15af34006e6a11974088a46d42b903418eJohann
2091b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
2101b362b15af34006e6a11974088a46d42b903418eJohann    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
2111b362b15af34006e6a11974088a46d42b903418eJohann    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)   (subtracting -1 adds 1)
2121b362b15af34006e6a11974088a46d42b903418eJohann
    ; q0:q1 = A[0..15], q2:q3 = B[0..15]; r1 advances 32 bytes per store
2131b362b15af34006e6a11974088a46d42b903418eJohann    vst1.16         {q0, q1}, [r1@128]! ; block A
2141b362b15af34006e6a11974088a46d42b903418eJohann    vst1.16         {q2, q3}, [r1@128]! ; block B
2151b362b15af34006e6a11974088a46d42b903418eJohann
2161b362b15af34006e6a11974088a46d42b903418eJohann    bx              lr
2171b362b15af34006e6a11974088a46d42b903418eJohann
2181b362b15af34006e6a11974088a46d42b903418eJohann    ENDP
2191b362b15af34006e6a11974088a46d42b903418eJohann
2201b362b15af34006e6a11974088a46d42b903418eJohann    END
2211b362b15af34006e6a11974088a46d42b903418eJohann
222