;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_short_fdct4x4_neon|
    EXPORT  |vp8_short_fdct8x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=4


    ALIGN 16    ; enable use of @128 bit aligned loads

; Constant table shared by both transforms.  It is read sequentially through
; r12 with post-increment, so the order of the rows below must match the
; order of the loads in the functions.
;   DCW rows: the two 16-bit multipliers (5352, 2217)
;   DCD rows: 32-bit accumulator seeds, i.e. the rounding terms added before
;             the >>12 (14500, 7500) and >>16 (12000, 51000) narrowing shifts
coeff
    DCW      5352,  5352,  5352, 5352
    DCW      2217,  2217,  2217, 2217
    DCD     14500, 14500, 14500, 14500
    DCD      7500,  7500,  7500, 7500
    DCD     12000, 12000, 12000, 12000
    DCD     51000, 51000, 51000, 51000

;void vp8_short_fdct4x4_neon(short *input, short *output, int pitch)
;
; Forward 4x4 transform of one block of 16-bit samples.
;   r0 = input   first row; each row is loaded with an @64 hint, so every
;                row address must be 8-byte aligned
;   r1 = output  16 coefficients, stored with an @128 hint (16-byte aligned)
;   r2 = pitch   added to r0 after every row load, i.e. a stride in bytes
; Modifies r0, r12, q0-q3, q8-q12 and d26.  d8-d15 are not touched.
|vp8_short_fdct4x4_neon| PROC

    ; Part one
    vld1.16         {d0}, [r0@64], r2
    adr             r12, coeff
    vld1.16         {d1}, [r0@64], r2
    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
    vld1.16         {d2}, [r0@64], r2
    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
    vld1.16         {d3}, [r0@64], r2

    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
    vtrn.32         d0, d2
    vtrn.32         d1, d3
    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
    vtrn.16         d0, d1
    vtrn.16         d2, d3

    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]

    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
    vshl.s16        q3, q3, #3      ; (c1, d1) << 3

    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1

    ; q9/q10 already hold the rounding terms, so the multiply-accumulates
    ; below produce the fully rounded 32-bit sums directly.
    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500

    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12


    ; Part two

    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    vtrn.32         d0, d2
    vtrn.32         d1, d3
    vtrn.16         d0, d1
    vtrn.16         d2, d3

    vmov.s16        d26, #7

    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
    vadd.s16        d4, d4, d26     ; a1 + 7
    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]

    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7

    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000

    vceq.s16        d4, d7, #0      ; lane = 0xFFFF where d1 == 0, else 0

    vshr.s16        d0, d0, #4      ; op[0] = (a1 + b1 + 7)>>4
    vshr.s16        d2, d2, #4      ; op[8] = (a1 - b1 + 7)>>4

    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000

    vmvn            d4, d4          ; lane = 0xFFFF (-1) where d1 != 0
    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)   (subtracting -1 adds 1)
    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16

    vst1.16         {q0, q1}, [r1@128]

    bx              lr

    ENDP

;void vp8_short_fdct8x4_neon(short *input, short *output, int pitch)
;
; Forward transform of two horizontally adjacent 4x4 blocks (A = low half of
; each 8-sample row, B = high half), processed together in Q registers.
;   r0 = input   first row; each row is loaded with an @128 hint, so every
;                row address must be 16-byte aligned
;   r1 = output  block A's 16 coefficients followed by block B's 16
;                coefficients, stored with @128 hints (16-byte aligned)
;   r2 = pitch   added to r0 after every row load, i.e. a stride in bytes
; Modifies r0, r1 (left pointing past block B), r12, q0-q3 and q8-q15.
; d8-d15 are not touched.
|vp8_short_fdct8x4_neon| PROC

    ; Part one

    vld1.16         {q0}, [r0@128], r2
    adr             r12, coeff
    vld1.16         {q1}, [r0@128], r2
    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
    vld1.16         {q2}, [r0@128], r2
    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
    vld1.16         {q3}, [r0@128], r2

    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
    vtrn.32         q0, q2          ; [A0|B0]
    vtrn.32         q1, q3          ; [A1|B1]
    vtrn.16         q0, q1          ; [A2|B2]
    vtrn.16         q2, q3          ; [A3|B3]

    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]

    vshl.s16        q11, q11, #3    ; a1 << 3
    vshl.s16        q12, q12, #3    ; b1 << 3
    vshl.s16        q13, q13, #3    ; c1 << 3
    vshl.s16        q14, q14, #3    ; d1 << 3

    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1

    ; Duplicate the rounding terms so blocks A and B each get their own
    ; pair of 32-bit accumulators.
    vmov.s16        q11, q9         ; 14500
    vmov.s16        q12, q10        ; 7500

    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500

    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500

    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12


    ; Part two
    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000

    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
    vtrn.32         q0, q2          ; q0=[A0 | B0]
    vtrn.32         q1, q3          ; q1=[A4 | B4]
    vtrn.16         q0, q1          ; q2=[A8 | B8]
    vtrn.16         q2, q3          ; q3=[A12|B12]

    vmov.s16        q15, #7

    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
    vadd.s16        q11, q11, q15   ; a1 + 7
    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]

    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
    vsub.s16        q1, q11, q12    ; a1 - b1 + 7

    vmov.s16        q11, q9         ; 12000
    vmov.s16        q12, q10        ; 51000

    ; The B halves (d1, d3) are moved into q2/q3 here, before d1 and d3 are
    ; overwritten with A[4] and A[12] below.
    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
    vshr.s16        d2, d2, #4      ; A[8] = (a1 - b1 + 7)>>4
    vshr.s16        d6, d3, #4      ; B[8] = (a1 - b1 + 7)>>4


    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000

    vceq.s16        q14, q14, #0    ; lane = 0xFFFF where d1 == 0, else 0

    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000

    vmvn            q14, q14        ; lane = 0xFFFF (-1) where d1 != 0

    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)   (subtracting -1 adds 1)

    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)   (subtracting -1 adds 1)

    vst1.16         {q0, q1}, [r1@128]! ; block A
    vst1.16         {q2, q3}, [r1@128]! ; block B

    bx              lr

    ENDP

    END