// Copyright 2012 Google Inc. All Rights Reserved.
//
// This code is licensed under the same terms as WebM:
//  Software License Agreement:  http://www.webmproject.org/license/software/
//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)

#include "./dsp.h"

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

#if defined(WEBP_USE_NEON)

#include "../enc/vp8enci.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOneNEON in the decoder, except
// that the prediction is read from *ref and the result is stored to *dst.
// See the comments there for algorithmic explanations.
static void ITransformOne(const uint8_t* ref,
                          const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0
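  // Note: vqdmulh.s16 returns (x * k * 2) >> 16. kC2 is stored already halved
  // (17734 == 35468 >> 1) so q9 directly holds (x * kC2) >> 16, while the
  // doubled kC1 product is halved again afterwards (vshr #1) and the input is
  // added back, matching the scalar reference where kC1 = 20091 + (1 << 16).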

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    //  q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // (val + 4) >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}
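
// For readability: a scalar sketch of what ITransformOne() above computes,
// written in the shape of the usual plain-C inverse transform (illustrative
// only, not compiled; the MUL/CLIP8 helpers are local to this sketch).
#if 0
#define MUL(a, b) (((a) * (b)) >> 16)
#define CLIP8(v) ((v) < 0 ? 0 : (v) > 255 ? 255 : (v))
static void ITransformOne_C(const uint8_t* ref, const int16_t* in,
                            uint8_t* dst) {
  const int kC1 = 20091 + (1 << 16);   // ~1.30656 in 16.16 fixed point
  const int kC2 = 35468;               // ~0.54120 in 16.16 fixed point
  int C[4 * 4], *tmp = C;
  int i;
  for (i = 0; i < 4; ++i) {            // vertical (column) pass
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
    tmp[0] = a + d;
    tmp[1] = b + c;
    tmp[2] = b - c;
    tmp[3] = a - d;
    tmp += 4;
    in++;
  }
  tmp = C;
  for (i = 0; i < 4; ++i) {            // horizontal (row) pass
    const int dc = tmp[0] + 4;         // +4 rounds the final >> 3
    const int a =  dc +  tmp[8];
    const int b =  dc -  tmp[8];
    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
    const int row = i * BPS;           // add the prediction and clip
    dst[0 + row] = CLIP8(ref[0 + row] + ((a + d) >> 3));
    dst[1 + row] = CLIP8(ref[1 + row] + ((b + c) >> 3));
    dst[2 + row] = CLIP8(ref[2 + row] + ((b - c) >> 3));
    dst[3 + row] = CLIP8(ref[3 + row] + ((a - d) >> 3));
    tmp++;
  }
}
#undef MUL
#undef CLIP8
#endif  // 0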
149010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
150010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Same code as dec_neon.c
1511320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tuccistatic void ITransformWHT(const int16_t* in, int16_t* out) {
152010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  const int kStep = 32; // The store is only incrementing the pointer as if we
153010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)                        // had stored a single byte.
  __asm__ volatile (
    // part 1
    // load data into q0, q1
    "vld1.16         {q0, q1}, [%[in]]           \n"

    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]

    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2

    // Transpose
    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
    "vtrn.32         q0, q1                      \n"
    "vtrn.32         q2, q3                      \n"

    "vmov.s32        q4, #3                      \n" // dc = 3
    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]

    "vadd.s32        q0, q6, q7                  \n"
    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
    "vadd.s32        q1, q9, q8                  \n"
    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
    "vsub.s32        q2, q6, q7                  \n"
    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
    "vsub.s32        q3, q9, q8                  \n"
    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3

    // set the results to output
    "vst1.16         d0[0], [%[out]], %[kStep]      \n"
    "vst1.16         d1[0], [%[out]], %[kStep]      \n"
    "vst1.16         d2[0], [%[out]], %[kStep]      \n"
    "vst1.16         d3[0], [%[out]], %[kStep]      \n"
    "vst1.16         d0[1], [%[out]], %[kStep]      \n"
    "vst1.16         d1[1], [%[out]], %[kStep]      \n"
    "vst1.16         d2[1], [%[out]], %[kStep]      \n"
    "vst1.16         d3[1], [%[out]], %[kStep]      \n"
    "vst1.16         d0[2], [%[out]], %[kStep]      \n"
    "vst1.16         d1[2], [%[out]], %[kStep]      \n"
    "vst1.16         d2[2], [%[out]], %[kStep]      \n"
    "vst1.16         d3[2], [%[out]], %[kStep]      \n"
    "vst1.16         d0[3], [%[out]], %[kStep]      \n"
    "vst1.16         d1[3], [%[out]], %[kStep]      \n"
    "vst1.16         d2[3], [%[out]], %[kStep]      \n"
    "vst1.16         d3[3], [%[out]], %[kStep]      \n"

    : [out] "+r"(out)  // modified registers
    : [in] "r"(in), [kStep] "r"(kStep)  // constants
    : "memory", "q0", "q1", "q2", "q3", "q4",
      "q5", "q6", "q7", "q8", "q9" // clobbered
  );
}
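
// Scalar sketch of the inverse WHT above (illustrative, not compiled). It
// makes the output stride explicit: each result is written 16 int16_t apart,
// which is why kStep is 32 bytes in the NEON stores.
#if 0
static void ITransformWHT_C(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {              // vertical pass
    const int a0 = in[0 + i] + in[12 + i];
    const int a1 = in[4 + i] + in[ 8 + i];
    const int a2 = in[4 + i] - in[ 8 + i];
    const int a3 = in[0 + i] - in[12 + i];
    tmp[0  + i] = a0 + a1;
    tmp[8  + i] = a0 - a1;
    tmp[4  + i] = a3 + a2;
    tmp[12 + i] = a3 - a2;
  }
  for (i = 0; i < 4; ++i) {              // horizontal pass
    const int dc = tmp[0 + i * 4] + 3;   // +3 rounds the final >> 3
    const int a0 = dc             + tmp[3 + i * 4];
    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    const int a3 = dc             - tmp[3 + i * 4];
    out[ 0] = (a0 + a1) >> 3;
    out[16] = (a3 + a2) >> 3;
    out[32] = (a0 - a1) >> 3;
    out[48] = (a3 - a2) >> 3;
    out += 64;                           // next set of four DC levels
  }
}
#endif  // 0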

// Forward transform.

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform(const uint8_t* src, const uint8_t* ref,
                       int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values into q4 and q6
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8 (d16=5352, d17=2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32     {q11,q12}, [%[coeff32]]      \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n" // a0 + a1
    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n" // a0 - a1
    "vshl.s16        d2, d2, #3               \n" // temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9
    // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn.s16        d4, d4                   \n"
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12] = (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]   \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}
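
// Scalar sketch of the forward transform above, in the usual plain-C shape
// (illustrative only, not compiled). It shows where the kCoeff16/kCoeff32
// constants and the rounding terms used by the NEON code come from.
#if 0
static void FTransform_C(const uint8_t* src, const uint8_t* ref,
                         int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = d0 + d3;
    const int a1 = d1 + d2;
    const int a2 = d1 - d2;
    const int a3 = d0 - d3;
    tmp[0 + i * 4] = (a0 + a1) * 8;                           // (a0 + a1) << 3
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;                           // (a0 - a1) << 3
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[ 8 + i];
    const int a2 = tmp[4 + i] - tmp[ 8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0  + i] = (a0 + a1 + 7) >> 4;
    out[4  + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8  + i] = (a0 - a1 + 7) >> 4;
    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
  }
}
#endif  // 0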

static void FTransformWHT(const int16_t* in, int16_t* out) {
  const int kStep = 32;
  __asm__ volatile (
    // d0 = in[0 * 16], d1 = in[1 * 16]
    // d2 = in[2 * 16], d3 = in[3 * 16]
    "vld1.16         d0[0], [%[in]], %[kStep]   \n"
    "vld1.16         d1[0], [%[in]], %[kStep]   \n"
    "vld1.16         d2[0], [%[in]], %[kStep]   \n"
    "vld1.16         d3[0], [%[in]], %[kStep]   \n"
    "vld1.16         d0[1], [%[in]], %[kStep]   \n"
    "vld1.16         d1[1], [%[in]], %[kStep]   \n"
    "vld1.16         d2[1], [%[in]], %[kStep]   \n"
    "vld1.16         d3[1], [%[in]], %[kStep]   \n"
    "vld1.16         d0[2], [%[in]], %[kStep]   \n"
    "vld1.16         d1[2], [%[in]], %[kStep]   \n"
    "vld1.16         d2[2], [%[in]], %[kStep]   \n"
    "vld1.16         d3[2], [%[in]], %[kStep]   \n"
    "vld1.16         d0[3], [%[in]], %[kStep]   \n"
    "vld1.16         d1[3], [%[in]], %[kStep]   \n"
    "vld1.16         d2[3], [%[in]], %[kStep]   \n"
    "vld1.16         d3[3], [%[in]], %[kStep]   \n"

    "vaddl.s16       q2, d0, d2                 \n"
    "vshl.s32        q2, q2, #2                 \n" // a0=(in[0*16]+in[2*16])<<2
    "vaddl.s16       q3, d1, d3                 \n"
    "vshl.s32        q3, q3, #2                 \n" // a1=(in[1*16]+in[3*16])<<2
    "vsubl.s16       q4, d1, d3                 \n"
    "vshl.s32        q4, q4, #2                 \n" // a2=(in[1*16]-in[3*16])<<2
    "vsubl.s16       q5, d0, d2                 \n"
    "vshl.s32        q5, q5, #2                 \n" // a3=(in[0*16]-in[2*16])<<2

    "vceq.s32        q10, q2, #0                \n"
    "vmvn.s32        q10, q10                   \n" // (a0 != 0)
    "vqadd.s32       q6, q2, q3                 \n" // (a0 + a1)
    "vqsub.s32       q6, q6, q10                \n" // (a0 + a1) + (a0 != 0)
    "vqadd.s32       q7, q5, q4                 \n" // a3 + a2
    "vqsub.s32       q8, q5, q4                 \n" // a3 - a2
    "vqsub.s32       q9, q2, q3                 \n" // a0 - a1

    // Transpose
    // q6 = tmp[0, 1,  2,  3] ; q7 = tmp[ 4,  5,  6,  7]
    // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
    "vswp            d13, d16                   \n" // vtrn.64 q6, q8
    "vswp            d15, d18                   \n" // vtrn.64 q7, q9
    "vtrn.32         q6, q7                     \n"
    "vtrn.32         q8, q9                     \n"

    "vqadd.s32       q0, q6, q8                 \n" // a0 = tmp[0] + tmp[8]
    "vqadd.s32       q1, q7, q9                 \n" // a1 = tmp[4] + tmp[12]
    "vqsub.s32       q2, q7, q9                 \n" // a2 = tmp[4] - tmp[12]
    "vqsub.s32       q3, q6, q8                 \n" // a3 = tmp[0] - tmp[8]

    "vqadd.s32       q4, q0, q1                 \n" // b0 = a0 + a1
    "vqadd.s32       q5, q3, q2                 \n" // b1 = a3 + a2
    "vqsub.s32       q6, q3, q2                 \n" // b2 = a3 - a2
    "vqsub.s32       q7, q0, q1                 \n" // b3 = a0 - a1

    "vmov.s32        q0, #3                     \n" // q0 = 3

    "vcgt.s32        q1, q4, #0                 \n" // (b0>0)
    "vqsub.s32       q2, q4, q1                 \n" // (b0+(b0>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b0+(b0>0)+3)
    "vshrn.s32       d18, q3, #3                \n" // (b0+(b0>0)+3) >> 3

    "vcgt.s32        q1, q5, #0                 \n" // (b1>0)
    "vqsub.s32       q2, q5, q1                 \n" // (b1+(b1>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b1+(b1>0)+3)
    "vshrn.s32       d19, q3, #3                \n" // (b1+(b1>0)+3) >> 3

    "vcgt.s32        q1, q6, #0                 \n" // (b2>0)
    "vqsub.s32       q2, q6, q1                 \n" // (b2+(b2>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b2+(b2>0)+3)
    "vshrn.s32       d20, q3, #3                \n" // (b2+(b2>0)+3) >> 3

    "vcgt.s32        q1, q7, #0                 \n" // (b3>0)
    "vqsub.s32       q2, q7, q1                 \n" // (b3+(b3>0))
    "vqadd.s32       q3, q2, q0                 \n" // (b3+(b3>0)+3)
    "vshrn.s32       d21, q3, #3                \n" // (b3+(b3>0)+3) >> 3

    "vst1.16         {q9, q10}, [%[out]]        \n"

    : [in] "+r"(in)
    : [kStep] "r"(kStep), [out] "r"(out)
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
      "q6", "q7", "q8", "q9", "q10"       // clobbered
  );
}
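
// Scalar sketch of FTransformWHT above (illustrative, not compiled). The in[]
// indexing shows why kStep is 32 bytes: the DC levels are picked up every
// 16 int16_t, one row of four every 64.
#if 0
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[ 8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[ 8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
  }
}
#endif  // 0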

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// The plain-C version does this with a TTransform() helper; here the whole
// transform and weighted sum are computed inline in NEON.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int kBPS = BPS;
  const uint8_t* A = a;
  const uint8_t* B = b;
  const uint16_t* W = w;
  int sum;
  __asm__ volatile (
    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
    "vld1.32         d2[1], [%[a]]            \n"

    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
    "vld1.32         d3[1], [%[b]]            \n"

    // a d0/d2, b d1/d3
    // d0/d1: 01 01 01 01
    // d2/d3: 23 23 23 23
    // But the values go in 01 45 23 67 order:
    // the middle values are transposed.
    "vtrn.16         q0, q1                   \n"

    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
    "vaddl.u8        q2, d0, d2               \n"
    "vaddl.u8        q10, d1, d3              \n"
    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
    "vsubl.u8        q3, d0, d2               \n"
    "vsubl.u8        q11, d1, d3              \n"

    // tmp[0] = a0 + a1
    "vpaddl.s16      q0, q2                   \n"
    "vpaddl.s16      q8, q10                  \n"

    // tmp[1] = a3 + a2
    "vpaddl.s16      q1, q3                   \n"
    "vpaddl.s16      q9, q11                  \n"

    // There is no pairwise-subtract instruction, so transpose and use vsubl:
    // q2 = {a0, a3}
    // q3 = {a1, a2}
    "vtrn.16         q2, q3                   \n"
    "vtrn.16         q10, q11                 \n"

    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
    "vsubl.s16       q12, d4, d6              \n"
    "vsubl.s16       q13, d5, d7              \n"
    "vsubl.s16       q14, d20, d22            \n"
    "vsubl.s16       q15, d21, d23            \n"

    // separate tmp[3] and tmp[2]
    // q12 = tmp[3]
    // q13 = tmp[2]
    "vtrn.32         q12, q13                 \n"
    "vtrn.32         q14, q15                 \n"

    // Transpose tmp for a
    "vswp            d1, d26                  \n" // vtrn.64
    "vswp            d3, d24                  \n" // vtrn.64
    "vtrn.32         q0, q1                   \n"
    "vtrn.32         q13, q12                 \n"

    // Transpose tmp for b
    "vswp            d17, d30                 \n" // vtrn.64
    "vswp            d19, d28                 \n" // vtrn.64
    "vtrn.32         q8, q9                   \n"
    "vtrn.32         q15, q14                 \n"

    // The first Q register is a, the second b.
    // q0/8 tmp[0-3]
    // q13/15 tmp[4-7]
    // q1/9 tmp[8-11]
    // q12/14 tmp[12-15]

    // These are still in 01 45 23 67 order. We fix it easily in the addition
    // case but the subtraction propagates them.
    "vswp            d3, d27                  \n"
    "vswp            d19, d31                 \n"

    // a0 = tmp[0] + tmp[8]
    "vadd.s32        q2, q0, q1               \n"
    "vadd.s32        q3, q8, q9               \n"

    // a1 = tmp[4] + tmp[12]
    "vadd.s32        q10, q13, q12            \n"
    "vadd.s32        q11, q15, q14            \n"

    // a2 = tmp[4] - tmp[12]
    "vsub.s32        q13, q13, q12            \n"
    "vsub.s32        q15, q15, q14            \n"

    // a3 = tmp[0] - tmp[8]
    "vsub.s32        q0, q0, q1               \n"
    "vsub.s32        q8, q8, q9               \n"

    // b0 = a0 + a1
    "vadd.s32        q1, q2, q10              \n"
    "vadd.s32        q9, q3, q11              \n"

    // b1 = a3 + a2
    "vadd.s32        q12, q0, q13             \n"
    "vadd.s32        q14, q8, q15             \n"

    // b2 = a3 - a2
    "vsub.s32        q0, q0, q13              \n"
    "vsub.s32        q8, q8, q15              \n"

    // b3 = a0 - a1
    "vsub.s32        q2, q2, q10              \n"
    "vsub.s32        q3, q3, q11              \n"

    "vld1.64         {q10, q11}, [%[w]]       \n"

    // abs(b0)
    "vabs.s32        q1, q1                   \n"
    "vabs.s32        q9, q9                   \n"
    // abs(b1)
    "vabs.s32        q12, q12                 \n"
    "vabs.s32        q14, q14                 \n"
    // abs(b2)
    "vabs.s32        q0, q0                   \n"
    "vabs.s32        q8, q8                   \n"
    // abs(b3)
    "vabs.s32        q2, q2                   \n"
    "vabs.s32        q3, q3                   \n"

    // expand w before using.
    "vmovl.u16       q13, d20                 \n"
    "vmovl.u16       q15, d21                 \n"

    // w[0] * abs(b0)
    "vmul.u32        q1, q1, q13              \n"
    "vmul.u32        q9, q9, q13              \n"

    // w[4] * abs(b1)
    "vmla.u32        q1, q12, q15             \n"
    "vmla.u32        q9, q14, q15             \n"

    // expand w before using.
    "vmovl.u16       q13, d22                 \n"
    "vmovl.u16       q15, d23                 \n"

    // w[8] * abs(b2)
    "vmla.u32        q1, q0, q13              \n"
    "vmla.u32        q9, q8, q13              \n"

    // w[12] * abs(b3)
    "vmla.u32        q1, q2, q15              \n"
    "vmla.u32        q9, q3, q15              \n"

    // Sum the arrays
    "vpaddl.u32      q1, q1                   \n"
    "vpaddl.u32      q9, q9                   \n"
    "vadd.u64        d2, d3                   \n"
    "vadd.u64        d18, d19                 \n"

    // The Hadamard transform needs 4 bits of extra precision (2 bits in each
    // direction) for dynamic range. Weights w[] are 16 bits at most, so the
    // maximum precision needed for a coefficient is 8 bits of input + 4 bits
    // of Hadamard transform + 16 bits for w[] + 2 bits of abs() summation.
    //
    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
    // A-OK.

    // sum2 - sum1
    "vsub.u32        d0, d2, d18              \n"
    // abs(sum2 - sum1)
    "vabs.s32        d0, d0                   \n"
    // abs(sum2 - sum1) >> 5
    "vshr.u32        d0, #5                   \n"

    // It would be better to move the value straight into r0 but I'm not
    // entirely sure how this works with inline assembly.
    "vmov.32         %[sum], d0[0]            \n"

    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
    : [kBPS] "r"(kBPS)
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
  );

  return sum;
}
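
// Scalar sketch of what Disto4x4 above computes: the plain-C code path does
// this with a TTransform() helper of roughly this shape (illustrative only,
// not compiled; abs() is from <stdlib.h>).
#if 0
static int TTransform_C(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += BPS) {    // horizontal pass
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {          // vertical pass, weighted sum
    const int a0 = tmp[0 + i] + tmp[ 8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[ 8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    sum += w[ 0] * abs(b0);
    sum += w[ 4] * abs(b1);
    sum += w[ 8] * abs(b2);
    sum += w[12] * abs(b3);
  }
  return sum;
}

static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  const int sum1 = TTransform_C(a, w);
  const int sum2 = TTransform_C(b, w);
  return abs(sum2 - sum1) >> 5;           // same final >> 5 as the NEON code
}
#endif  // 0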

static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
}

#endif   // WEBP_USE_NEON

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

void VP8EncDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;

  VP8ITransformWHT = ITransformWHT;
  VP8FTransformWHT = FTransformWHT;

  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
#endif   // WEBP_USE_NEON
}

#if defined(__cplusplus) || defined(c_plusplus)
}    // extern "C"
#endif