// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "./dsp.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(__cplusplus) || defined(c_plusplus)
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern "C" {
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
212a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#if defined(WEBP_USE_NEON)
222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "../dec/vp8i.h"
242a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
// Clobber list naming all sixteen NEON quad registers (q0-q15). Used by the
// loop-filter asm blocks below, which use several of them as scratch.
#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// XORs registers 'a' and 'b' in place with register 's'. Callers pass a vector
// of 0x80 bytes as 's', which toggles the top bit of every byte lane.
// The last line carries no continuation backslash so that whatever follows the
// macro can never be spliced into its expansion.
#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"
// Four-register form of FLIP_SIGN_BIT2: XORs 'a', 'b', 'c' and 'd' in place
// with 's'. No trailing continuation backslash on the last line.
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)
// Computes the per-byte filter mask into 'mask':
//   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh) ? 0xff : 0x00
// using saturating unsigned arithmetic. 'thresh' is a core register whose low
// byte is broadcast to all lanes. Uses q14 and q15 as scratch.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Computes the base filter value into 'o' with saturating signed arithmetic:
//   o = (p1 - q1) + 3 * (q0 - p0)
// Inputs are expected to be in signed form already (see FLIP_SIGN_BIT4 in
// DO_FILTER2). Uses q15 as scratch to hold (q0 - p0).
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Applies the (already masked) filter value 'fl' to the two pixels nearest the
// edge, using saturating signed arithmetic:
//   p0 += (fl + 3) >> 3
//   q0 -= (fl + 4) >> 3
// Uses q15 as scratch.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Applies the simple filter on 2 pixels (p0 and q0).
// Steps: build the filter mask in q9, flip the inputs to signed form using the
// 0x80 vector in q10, compute the base delta in q11, mask it, apply it to
// p0/q0, then flip p0/q0 back to unsigned form.
// On exit p1 and q1 are left in signed (sign-flipped) form; only p0 and q0 are
// restored. Uses q9-q11 plus the q14/q15 scratch of the helper macros.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Loads a vertical edge: 8 rows of 4 consecutive bytes each.
// Row i puts its four bytes into lane i of c1, c2, c3 and c4 respectively, so
// each destination register ends up holding one pixel column.
// Even rows are read through address operand b1 and odd rows through b2; each
// access post-increments its base register by 'stride'.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Stores a vertical edge: 8 rows of 2 consecutive bytes each.
// Row i writes lane i of c1 followed by lane i of c2 through address operand
// 'p', which is post-incremented by 'stride' after every row.
#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  __asm__ volatile (
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DO_FILTER2(q1, q2, q3, q4, %[thresh])
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [p] "+r"(p)
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [stride] "r"(stride), [thresh] "r"(thresh)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", QRegs
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  __asm__ volatile (
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DO_FILTER2(q1, q2, q3, q4, %[thresh])
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        %[p], %[p], #1                 \n"  // p - 1
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp        d5, d6                        \n"
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    STORE8x2(d4, d5, [%[p]], %[stride])
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    STORE8x2(d6, d7, [%[p]], %[stride])
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [p] "+r"(p)
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [stride] "r"(stride), [thresh] "r"(thresh)
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", "r4", "r5", "r6", QRegs
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Runs SimpleVFilter16NEON on the three horizontal edges located 4, 8 and 12
// rows below 'p'.
static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16NEON(p, stride, thresh);
  }
}
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Runs SimpleHFilter16NEON on the three vertical edges located 4, 8 and 12
// pixels to the right of 'p'.
static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16NEON(p, stride, thresh);
  }
}
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)
1622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kBPS = BPS;
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int16_t constants[] = {20091, 17734, 0, 0};
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * Technically these are unsigned but vqdmulh is only available in signed.
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * vqdmulh returns high half (effectively >> 16) but also doubles the value,
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * changing the >> 16 to >> 15 and requiring an additional >> 1.
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * We use this to our advantage with kC2. The canonical value is 35468.
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * However, the high bit is set so treating it as signed will give incorrect
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * results. We avoid this by down shifting by 1 here to clear the highest bit.
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * Combined with the doubling effect of vqdmulh we get >> 16.
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * This can not be applied to kC1 because the lowest bit is set. Down shifting
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * the constant would reduce precision.
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   */
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* libwebp uses a trick to avoid some extra addition that libvpx does.
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * Instead of:
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   * same issue with kC1 and vqdmulh that we work around by down shifting kC2
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   */
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  __asm__ volatile (
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.16         {q1, q2}, [%[in]]           \n"
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.16         {d0}, [%[constants]]        \n"
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d2: in[0]
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d3: in[8]
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d4: in[4]
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d5: in[12]
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp            d3, d4                      \n"
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * q9 = {in[4], in[12]} * kC2 >> 16
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q8, q2, d0[0]               \n"
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q9, q2, d0[1]               \n"
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d22 = a = in[0] + in[8]
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d23 = b = in[0] - in[8]
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d22, d2, d3                 \n"
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d23, d2, d3                 \n"
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* The multiplication should be x * kC1 >> 16
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * However, with vqdmulh we get x * kC1 * 2 >> 16
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * (multiply, double, return high half)
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * We avoided this in kC2 by pre-shifting the constant.
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * q8 = in[4]/[12] * kC1 >> 16
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vshr.s16        q8, q8, #1                  \n"
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* Add {in[4], in[12]} back after the multiplication. This is handled by
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * adding 1 << 16 to kC1 in the libwebp C code.
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q8, q2, q8                  \n"
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d20 = c = in[4]*kC2 - in[12]*kC1
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d21 = d = in[4]*kC1 + in[12]*kC2
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d20, d18, d17               \n"
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d21, d19, d16               \n"
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d2 = tmp[0] = a + d
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d3 = tmp[1] = b + c
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d4 = tmp[2] = b - c
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d5 = tmp[3] = a - d
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d2, d22, d21                \n"
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d3, d23, d20                \n"
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d4, d23, d20                \n"
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d5, d22, d21                \n"
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp            d3, d4                      \n"
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q8, q2, d0[0]               \n"
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q9, q2, d0[1]               \n"
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d22 = a = tmp[0] + tmp[8]
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d23 = b = tmp[0] - tmp[8]
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d22, d2, d3                 \n"
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d23, d2, d3                 \n"
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* See long winded explanations prior */
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vshr.s16        q8, q8, #1                  \n"
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q8, q2, q8                  \n"
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d20 = c = in[4]*kC2 - in[12]*kC1
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d21 = d = in[4]*kC1 + in[12]*kC2
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d20, d18, d17               \n"
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d21, d19, d16               \n"
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d2 = tmp[0] = a + d
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d3 = tmp[1] = b + c
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d4 = tmp[2] = b - c
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d5 = tmp[3] = a - d
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d2, d22, d21                \n"
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d3, d23, d20                \n"
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d4, d23, d20                \n"
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d5, d22, d21                \n"
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* (val) + 4 >> 3 */
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d2, d2, #3                  \n"
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d3, d3, #3                  \n"
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d4, d4, #3                  \n"
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d5, d5, #3                  \n"
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* Must accumulate before saturating */
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vmovl.u8        q8, d6                      \n"
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vmovl.u8        q9, d7                      \n"
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q1, q1, q8                  \n"
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q2, q2, q9                  \n"
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqmovun.s16     d0, q1                      \n"
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqmovun.s16     d1, q2                      \n"
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d1[1], [%[dst]]             \n"
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Applies TransformOneNEON() to one block, or to two blocks when 'do_two' is
// non-zero. The second block takes its coefficients from in + 16 and writes
// its result starting 4 bytes further into 'dst'.
static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOneNEON(in + 16 * n, dst + 4 * n);
  }
}
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// 4x4 Walsh-Hadamard-style transform (two add/sub butterfly passes) in NEON.
//   in:  16 consecutive int16_t values, fetched with a single 32-byte load.
//   out: 16 results, each stored as one int16_t, 16 elements apart
//        (out[0], out[16], ..., out[240]).
// Intermediate values are kept in 32-bit lanes. The final '>> 3' narrows each
// lane back to 16 bits by truncation (vshrn does not saturate).
static void TransformWHT(const int16_t* in, int16_t* out) {
  // Byte distance between two consecutive stores: the register post-index form
  // of vst1 advances %[out] by kStep bytes, i.e. by 16 int16_t elements.
  const int kStep = 32;
  __asm__ volatile (
    // part 1
    // load data into q0, q1
    // (d0 = in[0..3], d1 = in[4..7], d2 = in[8..11], d3 = in[12..15])
    "vld1.16         {q0, q1}, [%[in]]           \n"

    // Widening add/sub: four columns are processed at once, in 32-bit lanes.
    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]

    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2

    // Transpose the 4x4 matrix of 32-bit values. Afterwards:
    // q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13]
    // q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15]
    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
    "vtrn.32         q0, q1                      \n"
    "vtrn.32         q2, q3                      \n"

    // part 2
    // Lane i of q0..q3 now holds tmp[4 * i + 0..3]; the comments below name
    // the elements of one such row.
    "vmov.s32        q4, #3                      \n" // dc = 3
    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]

    // Each narrowed result lands in d0..d3, which alias q0/q1. This is safe
    // because the overwritten q register is no longer read at that point.
    "vadd.s32        q0, q6, q7                  \n"
    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
    "vadd.s32        q1, q9, q8                  \n"
    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
    "vsub.s32        q2, q6, q7                  \n"
    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
    "vsub.s32        q3, q9, q8                  \n"
    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3

    // set the results to output
    // One 16-bit lane per store, cycling d0..d3 for lane 0, then lane 1, etc.
    // Store number k (0..15) therefore writes out[16 * k].
    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
    "vst1.16         d1[0], [%[out]], %[kStep]   \n"
    "vst1.16         d2[0], [%[out]], %[kStep]   \n"
    "vst1.16         d3[0], [%[out]], %[kStep]   \n"
    "vst1.16         d0[1], [%[out]], %[kStep]   \n"
    "vst1.16         d1[1], [%[out]], %[kStep]   \n"
    "vst1.16         d2[1], [%[out]], %[kStep]   \n"
    "vst1.16         d3[1], [%[out]], %[kStep]   \n"
    "vst1.16         d0[2], [%[out]], %[kStep]   \n"
    "vst1.16         d1[2], [%[out]], %[kStep]   \n"
    "vst1.16         d2[2], [%[out]], %[kStep]   \n"
    "vst1.16         d3[2], [%[out]], %[kStep]   \n"
    "vst1.16         d0[3], [%[out]], %[kStep]   \n"
    "vst1.16         d1[3], [%[out]], %[kStep]   \n"
    "vst1.16         d2[3], [%[out]], %[kStep]   \n"
    "vst1.16         d3[3], [%[out]], %[kStep]   \n"

    // 'out' is an in/out operand because every store post-increments it.
    : [out] "+r"(out)  // modified registers
    : [in] "r"(in), [kStep] "r"(kStep)  // constants
    // q4-q7 alias d8-d15, which the ARM procedure call standard treats as
    // callee-saved; naming them here makes the compiler preserve them.
    : "memory", "q0", "q1", "q2", "q3", "q4",
      "q5", "q6", "q7", "q8", "q9"  // clobbered
  );
}
3852a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
3862a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif   // WEBP_USE_NEON
3872a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
3882a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//------------------------------------------------------------------------------
3892a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// Entry point
3902a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
// Prototype given ahead of the definition so the function has a prior
// declaration in this translation unit.
extern void VP8DspInitNEON(void);

// Installs the NEON routines defined in this file into the VP8* hooks
// (presumably function pointers declared in dsp.h). When WEBP_USE_NEON is not
// defined the body is empty and the hooks are left untouched.
void VP8DspInitNEON(void) {
#ifdef WEBP_USE_NEON
  // Simple loop filters: horizontal and vertical, edge and inner ('i')
  // variants.
  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
  VP8SimpleHFilter16 = SimpleHFilter16NEON;
  VP8SimpleVFilter16 = SimpleVFilter16NEON;

  // Transforms.
  VP8TransformWHT = TransformWHT;
  VP8Transform = TransformTwoNEON;
#endif  // WEBP_USE_NEON
}
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(__cplusplus) || defined(c_plusplus)
4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}    // extern "C"
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
408