// Copyright 2012 Google Inc. All Rights Reserved.
//
// This code is licensed under the same terms as WebM:
//  Software License Agreement:  http://www.webmproject.org/license/software/
//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)
125a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#include "./dsp.h"
145a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#if defined(WEBP_USE_NEON)
165a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#include "../dec/vp8i.h"
185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#if defined(__cplusplus) || defined(c_plusplus)
205a50414796e9a458925c7a13a15055d02406bf43Vikas Aroraextern "C" {
215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#endif
225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Comma-separated list of all sixteen NEON quad registers (q0..q15), meant to
// be pasted into the clobber section of an inline-assembly statement.
#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Expands to assembly text that XORs registers 'a' and 'b' in place with
// register 's'. Callers load 's' with 0x80 in every byte, so this toggles the
// sign bit of each lane (unsigned <-> signed representation).
// NOTE: the last line of each macro deliberately has no trailing '\' so the
// definition cannot swallow whatever line follows it.
#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"

// Same as FLIP_SIGN_BIT2, applied to four registers 'a', 'b', 'c' and 'd'.
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)
335a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Expands to assembly text that builds the per-lane filter mask:
//   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh) ? 0xff : 0x00
// using saturating unsigned 8-bit arithmetic. 'thresh' is a core register
// whose low byte is broadcast to every lane. Overwrites q14 and q15.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (thresh >= sum) */
425a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Expands to assembly text that computes, with saturating signed 8-bit
// arithmetic, the base filter value
//   o = (p1 - q1) + 3 * (q0 - p0)
// The inputs must already be in signed form. Overwrites q15.
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
495a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Expands to assembly text that applies the (already masked) filter value
// 'fl' to the two pixels next to the edge, in signed saturating arithmetic:
//   p0 += (fl + 3) >> 3
//   q0 -= (fl + 4) >> 3
// Overwrites q15.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
605a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Applies filter on 2 pixels (p0 and q0).
// p1, p0, q0, q1 name the registers holding the four pixels across the edge
// (unsigned bytes); 'thresh' is a core register holding the filter threshold.
// On exit p0 and q0 hold the filtered unsigned values. p1 and q1 are left in
// sign-flipped form, and q9, q10, q11, q14 and q15 are overwritten.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)                  /* back to unsigned value */
705a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Load/Store vertical edge.
// LOAD8x4 expands to eight vld4.8 single-lane loads. Load number i reads four
// consecutive bytes into lane i of c1, c2, c3 and c4. Even lanes read through
// address operand 'b1' and odd lanes through 'b2'; each load post-increments
// the base register it used by 'stride'.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
815a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Expands to eight vst2.8 single-lane stores. Store number i writes lane i of
// c1 and c2 as two consecutive bytes through address operand 'p', which is
// post-incremented by 'stride' after every store.
#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
915a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
945a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
955a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
965a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  __asm__ volatile (
975a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
985a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
995a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
1005a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
1015a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
1025a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
1035a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1045a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    DO_FILTER2(q1, q2, q3, q4, %[thresh])
1055a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1065a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
1075a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1085a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
1095a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
1105a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : [p] "+r"(p)
1115a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : [stride] "r"(stride), [thresh] "r"(thresh)
1125a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : "memory", QRegs
1135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  );
1145a50414796e9a458925c7a13a15055d02406bf43Vikas Arora}
1155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1165a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
1175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  __asm__ volatile (
1185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
1195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
1205a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
1215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
1235a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
1245a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
1255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
1265a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
1275a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1285a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    DO_FILTER2(q1, q2, q3, q4, %[thresh])
1295a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1305a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "sub        %[p], %[p], #1                 \n"  // p - 1
1315a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1325a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vswp        d5, d6                        \n"
1335a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    STORE8x2(d4, d5, [%[p]], %[stride])
1345a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    STORE8x2(d6, d7, [%[p]], %[stride])
1355a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1365a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : [p] "+r"(p)
1375a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : [stride] "r"(stride), [thresh] "r"(thresh)
1385a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : "memory", "r4", "r5", "r6", QRegs
1395a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  );
1405a50414796e9a458925c7a13a15055d02406bf43Vikas Arora}
1415a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Runs SimpleVFilter16NEON on the three edges located 4, 8 and 12 rows below
// 'p'. 'stride' is the row pitch in bytes and 'thresh' the filter threshold;
// both are forwarded unchanged.
static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;  // advance to the next edge, 4 rows further down
    SimpleVFilter16NEON(p, stride, thresh);
  }
}
1495a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Runs SimpleHFilter16NEON on the three edges located 4, 8 and 12 columns to
// the right of 'p'. 'stride' is the row pitch in bytes and 'thresh' the filter
// threshold; both are forwarded unchanged.
static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;  // advance to the next edge, 4 columns further right
    SimpleHFilter16NEON(p, stride, thresh);
  }
}
1575a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1585a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void TransformOneNEON(const int16_t *in, uint8_t *dst) {
1595a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  const int kBPS = BPS;
1605a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  const int16_t constants[] = {20091, 17734, 0, 0};
1615a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
1625a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * Technically these are unsigned but vqdmulh is only available in signed.
1635a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * vqdmulh returns high half (effectively >> 16) but also doubles the value,
1645a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * changing the >> 16 to >> 15 and requiring an additional >> 1.
1655a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * We use this to our advantage with kC2. The canonical value is 35468.
1665a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * However, the high bit is set so treating it as signed will give incorrect
1675a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * results. We avoid this by down shifting by 1 here to clear the highest bit.
1685a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * Combined with the doubling effect of vqdmulh we get >> 16.
1695a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * This can not be applied to kC1 because the lowest bit is set. Down shifting
1705a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * the constant would reduce precision.
1715a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   */
1725a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1735a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  /* libwebp uses a trick to avoid some extra addition that libvpx does.
1745a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * Instead of:
1755a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1765a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1775a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   * same issue with kC1 and vqdmulh that we work around by down shifting kC2
1785a50414796e9a458925c7a13a15055d02406bf43Vikas Arora   */
1795a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1805a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1815a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  __asm__ volatile (
1825a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.16         {q1, q2}, [%[in]]           \n"
1835a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.16         {d0}, [%[constants]]        \n"
1845a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1855a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* d2: in[0]
1865a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d3: in[8]
1875a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d4: in[4]
1885a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d5: in[12]
1895a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
1905a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vswp            d3, d4                      \n"
1915a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1925a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1935a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * q9 = {in[4], in[12]} * kC2 >> 16
1945a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
1955a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1965a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1975a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
1985a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* d22 = a = in[0] + in[8]
1995a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d23 = b = in[0] - in[8]
2005a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2015a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
2025a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
2035a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2045a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* The multiplication should be x * kC1 >> 16
2055a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * However, with vqdmulh we get x * kC1 * 2 >> 16
2065a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * (multiply, double, return high half)
2075a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * We avoided this in kC2 by pre-shifting the constant.
2085a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * q8 = in[4]/[12] * kC1 >> 16
2095a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2105a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
2115a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2125a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* Add {in[4], in[12]} back after the multiplication. This is handled by
2135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * adding 1 << 16 to kC1 in the libwebp C code.
2145a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
2165a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* d20 = c = in[4]*kC2 - in[12]*kC1
2185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d21 = d = in[4]*kC1 + in[12]*kC2
2195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2205a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
2215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
2225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2235a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* d2 = tmp[0] = a + d
2245a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d3 = tmp[1] = b + c
2255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d4 = tmp[2] = b - c
2265a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d5 = tmp[3] = a - d
2275a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2285a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
2295a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
2305a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
2315a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
2325a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2335a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vzip.16         q1, q2                      \n"
2345a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vzip.16         q1, q2                      \n"
2355a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2365a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vswp            d3, d4                      \n"
2375a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2385a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
2395a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
2405a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2415a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
2425a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
2435a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2445a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* d22 = a = tmp[0] + tmp[8]
2455a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d23 = b = tmp[0] - tmp[8]
2465a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2475a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
2485a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
2495a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2505a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* See long winded explanations prior */
2515a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
2525a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
2535a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2545a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* d20 = c = in[4]*kC2 - in[12]*kC1
2555a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d21 = d = in[4]*kC1 + in[12]*kC2
2565a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2575a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
2585a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
2595a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2605a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* d2 = tmp[0] = a + d
2615a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d3 = tmp[1] = b + c
2625a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d4 = tmp[2] = b - c
2635a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     * d5 = tmp[3] = a - d
2645a50414796e9a458925c7a13a15055d02406bf43Vikas Arora     */
2655a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
2665a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
2675a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
2685a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
2695a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2705a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
2715a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
2725a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
2735a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
2745a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2755a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
2765a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2775a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* (val) + 4 >> 3 */
2785a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vrshr.s16       d2, d2, #3                  \n"
2795a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vrshr.s16       d3, d3, #3                  \n"
2805a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vrshr.s16       d4, d4, #3                  \n"
2815a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vrshr.s16       d5, d5, #3                  \n"
2825a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2835a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vzip.16         q1, q2                      \n"
2845a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vzip.16         q1, q2                      \n"
2855a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2865a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    /* Must accumulate before saturating */
2875a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vmovl.u8        q8, d6                      \n"
2885a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vmovl.u8        q9, d7                      \n"
2895a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2905a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       q1, q1, q8                  \n"
2915a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqadd.s16       q2, q2, q9                  \n"
2925a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2935a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqmovun.s16     d0, q1                      \n"
2945a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vqmovun.s16     d1, q2                      \n"
2955a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
2965a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
2975a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
2985a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
2995a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    "vst1.32         d1[1], [%[dst]]             \n"
3005a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
3015a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
3025a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
3035a50414796e9a458925c7a13a15055d02406bf43Vikas Arora    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
3045a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  );
3055a50414796e9a458925c7a13a15055d02406bf43Vikas Arora}
3065a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
// Transforms one 4x4 block, or two horizontally adjacent ones.
//   in     : coefficients; the second block's 16 coefficients start at in + 16.
//   dst    : destination pixels; the second block starts at dst + 4.
//   do_two : when non-zero the second block is processed as well.
static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOneNEON(in, dst);
  if (do_two) {
    TransformOneNEON(in + 16, dst + 4);
  }
}
3135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
3145a50414796e9a458925c7a13a15055d02406bf43Vikas Aroraextern void VP8DspInitNEON(void);
3155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
3165a50414796e9a458925c7a13a15055d02406bf43Vikas Aroravoid VP8DspInitNEON(void) {
3175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  VP8Transform = TransformTwoNEON;
3185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
3195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  VP8SimpleVFilter16 = SimpleVFilter16NEON;
3205a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  VP8SimpleHFilter16 = SimpleHFilter16NEON;
3215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
3225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
3235a50414796e9a458925c7a13a15055d02406bf43Vikas Arora}
3245a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
3255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#if defined(__cplusplus) || defined(c_plusplus)
3265a50414796e9a458925c7a13a15055d02406bf43Vikas Arora}    // extern "C"
3275a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#endif
3285a50414796e9a458925c7a13a15055d02406bf43Vikas Arora
3295a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#endif   // WEBP_USE_NEON
330