enc_neon.c revision 0406ce1417f76f2034833414dcecc9f56253640c
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "./dsp.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(__cplusplus) || defined(c_plusplus)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern "C" {
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
19d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
20d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)#if defined(WEBP_USE_NEON)
21d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
22d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)#include "../enc/vp8enci.h"
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Inverse transform.
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This code is pretty much the same as TransformOneNEON in the decoder, except
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for subtraction to *ref. See the comments there for algorithmic explanations.
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void ITransformOne(const uint8_t* ref,
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          const int16_t* in, uint8_t* dst) {
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kBPS = BPS;
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
35d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)  __asm__ volatile (
36d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vld1.16         {q1, q2}, [%[in]]           \n"
37d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vld1.16         {d0}, [%[kC1C2]]            \n"
38d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
39d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d2: in[0]
40d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d3: in[8]
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d4: in[4]
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d5: in[12]
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp            d3, d4                      \n"
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // q9 = {in[4], in[12]} * kC2 >> 16
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q8, q2, d0[0]               \n"
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q9, q2, d0[1]               \n"
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d22 = a = in[0] + in[8]
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d23 = b = in[0] - in[8]
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d22, d2, d3                 \n"
534e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)    "vqsub.s16       d23, d2, d3                 \n"
544e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)
554e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)    //  q8 = in[4]/[12] * kC1 >> 16
564e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)    "vshr.s16        q8, q8, #1                  \n"
574e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Add {in[4], in[12]} back after the multiplication.
59d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vqadd.s16       q8, q2, q8                  \n"
60d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
61d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d20 = c = in[4]*kC2 - in[12]*kC1
62d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d21 = d = in[4]*kC1 + in[12]*kC2
63d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vqsub.s16       d20, d18, d17               \n"
64d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vqadd.s16       d21, d19, d16               \n"
65d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
66d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d2 = tmp[0] = a + d
67d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d3 = tmp[1] = b + c
68d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d4 = tmp[2] = b - c
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d5 = tmp[3] = a - d
70d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vqadd.s16       d2, d22, d21                \n"
71d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vqadd.s16       d3, d23, d20                \n"
72d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vqsub.s16       d4, d23, d20                \n"
73d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vqsub.s16       d5, d22, d21                \n"
74d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
75d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vzip.16         q1, q2                      \n"
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp            d3, d4                      \n"
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q8, q2, d0[0]               \n"
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q9, q2, d0[1]               \n"
84a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
85a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // d22 = a = tmp[0] + tmp[8]
86a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // d23 = b = tmp[0] - tmp[8]
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d22, d2, d3                 \n"
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d23, d2, d3                 \n"
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vshr.s16        q8, q8, #1                  \n"
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q8, q2, q8                  \n"
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d20 = c = in[4]*kC2 - in[12]*kC1
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d21 = d = in[4]*kC1 + in[12]*kC2
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d20, d18, d17               \n"
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d21, d19, d16               \n"
97a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
98a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // d2 = tmp[0] = a + d
99a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // d3 = tmp[1] = b + c
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d4 = tmp[2] = b - c
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // d5 = tmp[3] = a - d
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d2, d22, d21                \n"
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d3, d23, d20                \n"
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d4, d23, d20                \n"
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d5, d22, d21                \n"
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // (val) + 4 >> 3
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d2, d2, #3                  \n"
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d3, d3, #3                  \n"
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d4, d4, #3                  \n"
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d5, d5, #3                  \n"
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1201320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    "vzip.16         q1, q2                      \n"
1211320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    "vzip.16         q1, q2                      \n"
1221320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci
1231320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    // Must accumulate before saturating
1241320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    "vmovl.u8        q8, d6                      \n"
1251320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    "vmovl.u8        q9, d7                      \n"
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q1, q1, q8                  \n"
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q2, q2, q9                  \n"
1291320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci
1301320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    "vqmovun.s16     d0, q1                      \n"
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqmovun.s16     d1, q2                      \n"
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1361320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    "vst1.32         d1[1], [%[dst]]             \n"
1371320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci
1381320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Runs the inverse transform on one 4x4 block, or on two horizontally
// adjacent blocks when 'do_two' is non-zero. The second block sits 4 pixels
// to the right in 'ref'/'dst' and 16 coefficients further in 'in'.
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Inverse Walsh-Hadamard transform (original note: same code as dec_neon.c).
// Reads 16 coefficients from 'in' and writes 16 results starting at 'out',
// one every 32 bytes (i.e. every 16th int16_t slot).

// Stores lane LANE of d0, d1, d2 and d3 in that order; each store
// post-increments 'out' by the byte stride held in %[stride].
#define WHT_STORE_LANE(LANE)                                            \
    "vst1.16         d0[" #LANE "], [%[out]], %[stride]      \n"        \
    "vst1.16         d1[" #LANE "], [%[out]], %[stride]      \n"        \
    "vst1.16         d2[" #LANE "], [%[out]], %[stride]      \n"        \
    "vst1.16         d3[" #LANE "], [%[out]], %[stride]      \n"

static void ITransformWHT(const int16_t* in, int16_t* out) {
  // Amount, in bytes, added to 'out' after each single-lane store.
  const int kOutStrideBytes = 32;

  __asm__ volatile (
    // First pass: widen to 32 bits while combining rows.
    "vld1.16         {q0, q1}, [%[in]]           \n"

    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]

    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2

    // 4x4 transpose of the 32-bit intermediates. Afterwards:
    // q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13]
    // q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15]
    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
    "vtrn.32         q0, q1                      \n"
    "vtrn.32         q2, q3                      \n"

    // Second pass, with the rounding constant folded into the DC term.
    "vmov.s32        q4, #3                      \n" // dc = 3
    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]

    // Narrow back to 16 bits: d0..d3 each hold one output per row.
    "vadd.s32        q0, q6, q7                  \n"
    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
    "vadd.s32        q1, q9, q8                  \n"
    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
    "vsub.s32        q2, q6, q7                  \n"
    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
    "vsub.s32        q3, q9, q8                  \n"
    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3

    // Scatter the 16 results, lane by lane.
    WHT_STORE_LANE(0)
    WHT_STORE_LANE(1)
    WHT_STORE_LANE(2)
    WHT_STORE_LANE(3)

    : [out] "+r"(out)  // modified registers
    : [in] "r"(in), [stride] "r"(kOutStrideBytes)  // constants
    : "memory", "q0", "q1", "q2", "q3", "q4",
      "q5", "q6", "q7", "q8", "q9" // clobbered
  );
}

#undef WHT_STORE_LANE
219d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
// Forward transform.

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// 16-bit multipliers for the forward transform, each repeated four times so
// that one 64-bit half of the table holds a single constant in every lane:
// elements 0-3 are 5352, elements 4-7 are 2217.
static const int16_t kCoeff16[8] = {
  5352, 5352, 5352, 5352,
  2217, 2217, 2217, 2217
};
// 32-bit additive constants for the forward transform, four copies of each.
// The row order matters: the table is consumed as two consecutive pairs of
// 128-bit loads (rows 0-1, then rows 2-3).
static const int32_t kCoeff32[16] = {
  1812,  1812,  1812,  1812,    // row 0
  937,   937,   937,   937,     // row 1
  12000, 12000, 12000, 12000,   // row 2
  51000, 51000, 51000, 51000    // row 3
};
23258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
23358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdochstatic void FTransform(const uint8_t* src, const uint8_t* ref,
23458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch                       int16_t* out) {
23558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch  const int kBPS = BPS;
23658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch  const uint8_t* src_ptr = src;
237a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  const uint8_t* ref_ptr = ref;
23858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch  const int16_t* coeff16 = kCoeff16;
239a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  const int32_t* coeff32 = kCoeff32;
24058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
24158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch  __asm__ volatile (
24258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // load src into q4, q5 in high half
24358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
24458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
24558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
24658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d11}, [%[src_ptr]]               \n"
24758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
24858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // load ref into q6, q7 in high half
24958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
25058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
25158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
25258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.8 {d15}, [%[ref_ptr]]               \n"
253d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
254d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // Pack the high values in to q4 and q6
255d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vtrn.32     q4, q5                       \n"
256d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vtrn.32     q6, q7                       \n"
257d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
258d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    // d[0-3] = src - ref
259d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vsubl.u8    q0, d8, d12                  \n"
260a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    "vsubl.u8    q1, d9, d13                  \n"
26158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
26258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // load coeff16 into q8(d16=5352, d17=2217)
26358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16     {q8}, [%[coeff16]]           \n"
26458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
26558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // load coeff32 high half into q9 = 1812, q10 = 937
26658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
267a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
26858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // load coeff32 low half into q11=12000, q12=51000
269a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
27058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
27158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // part 1
27258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // Transpose. Register dN is the same as dN in C
27358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.32         d0, d2                   \n"
27458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.32         d1, d3                   \n"
27558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.16         d0, d1                   \n"
27658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.16         d2, d3                   \n"
27758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
27858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
27958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
28058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
28158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
28258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
283d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vadd.s16        d0, d4, d5               \n" // a0 + a1
284d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
285d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vsub.s16        d2, d4, d5               \n" // a0 - a1
286d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
287d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
288d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
289d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
290a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
29158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
29258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
29358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
29458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
29558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vshrn.s32       d1, q9, #9               \n"
29658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vshrn.s32       d3, q10, #9              \n"
297a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
29858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // part 2
299a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
30058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.32         d0, d2                   \n"
30158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.32         d1, d3                   \n"
30258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.16         d0, d1                   \n"
30358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vtrn.16         d2, d3                   \n"
30458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
30558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vmov.s16        d26, #7                  \n"
30658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
30758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
30858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
30958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
31058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vadd.s16        d4, d4, d26              \n" // a1 + 7
31158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
31258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
313d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
314d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
315d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
316d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
317d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
318d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)
319d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)    "vceq.s16        d4, d7, #0               \n"
320a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
32158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vshr.s16        d0, d0, #4               \n"
32258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vshr.s16        d2, d2, #4               \n"
32358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
32458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
32558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
32658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
32758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vmvn            d4, d4                   \n" // !(d1 == 0)
32858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
32958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vshrn.s32       d1, q11, #16             \n"
33058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // op[4] += (d1!=0)
33158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vsub.s16        d1, d1, d4               \n"
33258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
33358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vshrn.s32       d3, q12, #16             \n"
33458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
33558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // set result to out array
33658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vst1.16         {q0, q1}, [%[out]]   \n"
33758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
33858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch      [coeff32] "+r"(coeff32)          // modified registers
33958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
34058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch      [out] "r"(out)                   // constants
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "q10", "q11", "q12", "q13"       // clobbered
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
34558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch
34658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdochstatic void FTransformWHT(const int16_t* in, int16_t* out) {
34758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch  const int kStep = 32;
34858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch  __asm__ volatile (
34958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // d0 = in[0 * 16] , d1 = in[1 * 16]
35058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    // d2 = in[2 * 16] , d3 = in[3 * 16]
35158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d0[0], [%[in]], %[kStep]   \n"
35258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d1[0], [%[in]], %[kStep]   \n"
35358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d2[0], [%[in]], %[kStep]   \n"
35458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d3[0], [%[in]], %[kStep]   \n"
35558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d0[1], [%[in]], %[kStep]   \n"
35658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d1[1], [%[in]], %[kStep]   \n"
357a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    "vld1.16         d2[1], [%[in]], %[kStep]   \n"
358a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    "vld1.16         d3[1], [%[in]], %[kStep]   \n"
35958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d0[2], [%[in]], %[kStep]   \n"
36058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d1[2], [%[in]], %[kStep]   \n"
36158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d2[2], [%[in]], %[kStep]   \n"
36258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d3[2], [%[in]], %[kStep]   \n"
36358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch    "vld1.16         d0[3], [%[in]], %[kStep]   \n"
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.16         d1[3], [%[in]], %[kStep]   \n"
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.16         d2[3], [%[in]], %[kStep]   \n"
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.16         d3[3], [%[in]], %[kStep]   \n"
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vaddl.s16       q2, d0, d2                 \n" // a0=(in[0*16]+in[2*16])
369    "vaddl.s16       q3, d1, d3                 \n" // a1=(in[1*16]+in[3*16])
370    "vsubl.s16       q4, d1, d3                 \n" // a2=(in[1*16]-in[3*16])
371    "vsubl.s16       q5, d0, d2                 \n" // a3=(in[0*16]-in[2*16])
372
373    "vqadd.s32       q6, q2, q3                 \n" // a0 + a1
374    "vqadd.s32       q7, q5, q4                 \n" // a3 + a2
375    "vqsub.s32       q8, q5, q4                 \n" // a3 - a2
376    "vqsub.s32       q9, q2, q3                 \n" // a0 - a1
377
378    // Transpose
379    // q6 = tmp[0, 1,  2,  3] ; q7 = tmp[ 4,  5,  6,  7]
380    // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
381    "vswp            d13, d16                   \n" // vtrn.64 q0, q2
382    "vswp            d15, d18                   \n" // vtrn.64 q1, q3
383    "vtrn.32         q6, q7                     \n"
384    "vtrn.32         q8, q9                     \n"
385
386    "vqadd.s32       q0, q6, q8                 \n" // a0 = tmp[0] + tmp[8]
387    "vqadd.s32       q1, q7, q9                 \n" // a1 = tmp[4] + tmp[12]
388    "vqsub.s32       q2, q7, q9                 \n" // a2 = tmp[4] - tmp[12]
389    "vqsub.s32       q3, q6, q8                 \n" // a3 = tmp[0] - tmp[8]
390
391    "vqadd.s32       q4, q0, q1                 \n" // b0 = a0 + a1
392    "vqadd.s32       q5, q3, q2                 \n" // b1 = a3 + a2
393    "vqsub.s32       q6, q3, q2                 \n" // b2 = a3 - a2
394    "vqsub.s32       q7, q0, q1                 \n" // b3 = a0 - a1
395
396    "vshrn.s32       d18, q4, #1                \n" // b0 >> 1
397    "vshrn.s32       d19, q5, #1                \n" // b1 >> 1
398    "vshrn.s32       d20, q6, #1                \n" // b2 >> 1
399    "vshrn.s32       d21, q7, #1                \n" // b3 >> 1
400
401    "vst1.16         {q9, q10}, [%[out]]        \n"
402
403    : [in] "+r"(in)
404    : [kStep] "r"(kStep), [out] "r"(out)
405    : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
406      "q6", "q7", "q8", "q9", "q10"       // clobbered
407  ) ;
408}
409
410//------------------------------------------------------------------------------
411// Texture distortion
412//
413// We try to match the spectral content (weighted) between source and
414// reconstructed samples.
415
416// Hadamard transform
417// Returns the weighted sum of the absolute value of transformed coefficients.
418// This uses a TTransform helper function in C
419static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
420                    const uint16_t* const w) {
421  const int kBPS = BPS;
422  const uint8_t* A = a;
423  const uint8_t* B = b;
424  const uint16_t* W = w;
425  int sum;
426  __asm__ volatile (
427    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
428    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
429    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
430    "vld1.32         d2[1], [%[a]]            \n"
431
432    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
433    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
434    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
435    "vld1.32         d3[1], [%[b]]            \n"
436
437    // a d0/d2, b d1/d3
438    // d0/d1: 01 01 01 01
439    // d2/d3: 23 23 23 23
440    // But: it goes 01 45 23 67
441    // Notice the middle values are transposed
442    "vtrn.16         q0, q1                   \n"
443
444    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
445    "vaddl.u8        q2, d0, d2               \n"
446    "vaddl.u8        q10, d1, d3              \n"
447    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
448    "vsubl.u8        q3, d0, d2               \n"
449    "vsubl.u8        q11, d1, d3              \n"
450
451    // tmp[0] = a0 + a1
452    "vpaddl.s16      q0, q2                   \n"
453    "vpaddl.s16      q8, q10                  \n"
454
455    // tmp[1] = a3 + a2
456    "vpaddl.s16      q1, q3                   \n"
457    "vpaddl.s16      q9, q11                  \n"
458
459    // No pair subtract
460    // q2 = {a0, a3}
461    // q3 = {a1, a2}
462    "vtrn.16         q2, q3                   \n"
463    "vtrn.16         q10, q11                 \n"
464
465    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
466    "vsubl.s16       q12, d4, d6              \n"
467    "vsubl.s16       q13, d5, d7              \n"
468    "vsubl.s16       q14, d20, d22            \n"
469    "vsubl.s16       q15, d21, d23            \n"
470
471    // separate tmp[3] and tmp[2]
472    // q12 = tmp[3]
473    // q13 = tmp[2]
474    "vtrn.32         q12, q13                 \n"
475    "vtrn.32         q14, q15                 \n"
476
477    // Transpose tmp for a
478    "vswp            d1, d26                  \n" // vtrn.64
479    "vswp            d3, d24                  \n" // vtrn.64
480    "vtrn.32         q0, q1                   \n"
481    "vtrn.32         q13, q12                 \n"
482
483    // Transpose tmp for b
484    "vswp            d17, d30                 \n" // vtrn.64
485    "vswp            d19, d28                 \n" // vtrn.64
486    "vtrn.32         q8, q9                   \n"
487    "vtrn.32         q15, q14                 \n"
488
489    // The first Q register is a, the second b.
490    // q0/8 tmp[0-3]
491    // q13/15 tmp[4-7]
492    // q1/9 tmp[8-11]
493    // q12/14 tmp[12-15]
494
495    // These are still in 01 45 23 67 order. We fix it easily in the addition
496    // case but the subtraction propegates them.
497    "vswp            d3, d27                  \n"
498    "vswp            d19, d31                 \n"
499
500    // a0 = tmp[0] + tmp[8]
501    "vadd.s32        q2, q0, q1               \n"
502    "vadd.s32        q3, q8, q9               \n"
503
504    // a1 = tmp[4] + tmp[12]
505    "vadd.s32        q10, q13, q12            \n"
506    "vadd.s32        q11, q15, q14            \n"
507
508    // a2 = tmp[4] - tmp[12]
509    "vsub.s32        q13, q13, q12            \n"
510    "vsub.s32        q15, q15, q14            \n"
511
512    // a3 = tmp[0] - tmp[8]
513    "vsub.s32        q0, q0, q1               \n"
514    "vsub.s32        q8, q8, q9               \n"
515
516    // b0 = a0 + a1
517    "vadd.s32        q1, q2, q10              \n"
518    "vadd.s32        q9, q3, q11              \n"
519
520    // b1 = a3 + a2
521    "vadd.s32        q12, q0, q13             \n"
522    "vadd.s32        q14, q8, q15             \n"
523
524    // b2 = a3 - a2
525    "vsub.s32        q0, q0, q13              \n"
526    "vsub.s32        q8, q8, q15              \n"
527
528    // b3 = a0 - a1
529    "vsub.s32        q2, q2, q10              \n"
530    "vsub.s32        q3, q3, q11              \n"
531
532    "vld1.64         {q10, q11}, [%[w]]       \n"
533
534    // abs(b0)
535    "vabs.s32        q1, q1                   \n"
536    "vabs.s32        q9, q9                   \n"
537    // abs(b1)
538    "vabs.s32        q12, q12                 \n"
539    "vabs.s32        q14, q14                 \n"
540    // abs(b2)
541    "vabs.s32        q0, q0                   \n"
542    "vabs.s32        q8, q8                   \n"
543    // abs(b3)
544    "vabs.s32        q2, q2                   \n"
545    "vabs.s32        q3, q3                   \n"
546
547    // expand w before using.
548    "vmovl.u16       q13, d20                 \n"
549    "vmovl.u16       q15, d21                 \n"
550
551    // w[0] * abs(b0)
552    "vmul.u32        q1, q1, q13              \n"
553    "vmul.u32        q9, q9, q13              \n"
554
555    // w[4] * abs(b1)
556    "vmla.u32        q1, q12, q15             \n"
557    "vmla.u32        q9, q14, q15             \n"
558
559    // expand w before using.
560    "vmovl.u16       q13, d22                 \n"
561    "vmovl.u16       q15, d23                 \n"
562
563    // w[8] * abs(b1)
564    "vmla.u32        q1, q0, q13              \n"
565    "vmla.u32        q9, q8, q13              \n"
566
567    // w[12] * abs(b1)
568    "vmla.u32        q1, q2, q15              \n"
569    "vmla.u32        q9, q3, q15              \n"
570
571    // Sum the arrays
572    "vpaddl.u32      q1, q1                   \n"
573    "vpaddl.u32      q9, q9                   \n"
574    "vadd.u64        d2, d3                   \n"
575    "vadd.u64        d18, d19                 \n"
576
577    // Hadamard transform needs 4 bits of extra precision (2 bits in each
578    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
579    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
580    // 16bits for w[] + 2 bits of abs() summation.
581    //
582    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
583    // A-OK.
584
585    // sum2 - sum1
586    "vsub.u32        d0, d2, d18              \n"
587    // abs(sum2 - sum1)
588    "vabs.s32        d0, d0                   \n"
589    // abs(sum2 - sum1) >> 5
590    "vshr.u32        d0, #5                   \n"
591
592    // It would be better to move the value straight into r0 but I'm not
593    // entirely sure how this works with inline assembly.
594    "vmov.32         %[sum], d0[0]            \n"
595
596    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
597    : [kBPS] "r"(kBPS)
598    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
599      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
600  ) ;
601
602  return sum;
603}
604
605static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
606                      const uint16_t* const w) {
607  int D = 0;
608  int x, y;
609  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
610    for (x = 0; x < 16; x += 4) {
611      D += Disto4x4(a + x + y, b + x + y, w);
612    }
613  }
614  return D;
615}
616
617#endif   // WEBP_USE_NEON
618
619//------------------------------------------------------------------------------
620// Entry point
621
// Installs the NEON implementations defined in this file into the encoder's
// DSP hooks. When WEBP_USE_NEON is not defined this is a no-op and the hooks
// are left untouched.
extern void VP8EncDspInitNEON(void);

void VP8EncDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // Transform hooks.
  VP8FTransform = FTransform;
  VP8ITransform = ITransform;
  VP8FTransformWHT = FTransformWHT;
  VP8ITransformWHT = ITransformWHT;

  // Texture distortion hooks.
  VP8TDisto16x16 = Disto16x16;
  VP8TDisto4x4 = Disto4x4;
#endif   // WEBP_USE_NEON
}
636
637#if defined(__cplusplus) || defined(c_plusplus)
638}    // extern "C"
639#endif
640