// dec_neon.c, revision 8b720228d581a84fd173b6dcb2fa295b59db489a
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include "../dec/vp8i.h"

#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"                             \

#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)                                                      \

#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (q15 <= thresh) */

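// Scalar sketch of the NEEDS_FILTER test above, assuming plain int pixels
// (illustrative only; the decoder never calls this and the helper name is
// hypothetical). A pixel pair straddling the edge gets filtered when
//   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= thresh.
// The u8 saturation of vqadd.u8 is immaterial here: int math cannot overflow.
static int NeedsFilterSketch(int p1, int p0, int q0, int q1, int thresh) {
  const int a0 = (p0 > q0) ? p0 - q0 : q0 - p0;   // abs(p0 - q0)
  const int a1 = (p1 > q1) ? p1 - q1 : q1 - p1;   // abs(p1 - q1)
  return a0 * 2 + a1 / 2 <= thresh;
}
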
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */

#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)

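// Scalar sketch of DO_FILTER2 (illustrative only; the decoder uses the NEON
// macros above, and both helper names here are hypothetical). It mirrors the
// NEON flow: flip to signed, build the base delta with per-step saturating
// s8 arithmetic, then split the rounding between q0 (+4) and p0 (+3).
static int Ssat8(int v) {  // saturate to [-128, 127], as vqadd.s8/vqsub.s8 do
  return (v < -128) ? -128 : (v > 127) ? 127 : v;
}
static void DoFilter2Sketch(uint8_t* p1, uint8_t* p0,
                            uint8_t* q0, uint8_t* q1, int thresh) {
  const int a0 = (*p0 > *q0) ? *p0 - *q0 : *q0 - *p0;
  const int a1 = (*p1 > *q1) ? *p1 - *q1 : *q1 - *p1;
  if (a0 * 2 + a1 / 2 > thresh) return;   // NEEDS_FILTER said no
  {
    // FLIP_SIGN_BIT4: veor with 0x80 maps u8 [0, 255] onto s8 [-128, 127].
    const int sp1 = *p1 - 128, sp0 = *p0 - 128;
    const int sq0 = *q0 - 128, sq1 = *q1 - 128;
    // GET_BASE_DELTA: (p1 - q1) + 3 * (q0 - p0), saturating after each add.
    const int d0 = Ssat8(sq0 - sp0);
    int delta = Ssat8(sp1 - sq1);
    delta = Ssat8(delta + d0);
    delta = Ssat8(delta + d0);
    delta = Ssat8(delta + d0);
    // DO_SIMPLE_FILTER: p0 takes the +3-biased step, q0 the +4-biased one.
    *p0 = (uint8_t)(Ssat8(sp0 + (Ssat8(delta + 3) >> 3)) + 128);
    *q0 = (uint8_t)(Ssat8(sq0 - (Ssat8(delta + 4) >> 3)) + 128);
  }
}
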
// Load/Store vertical edge
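// LOAD8x4 reads the 4 bytes straddling the edge (p1 p0 q0 q1) from each of
// 8 rows, alternating between two row pointers that each advance by #stride
// (the caller passes 2 * stride), and de-interleaves them with vld4.8 so
// that every destination d-register ends up holding one 8-pixel column.
// This transposes the 8x4 block: a vertical edge can then be filtered with
// the same register layout as a horizontal one. STORE8x2 scatters the two
// filtered columns (p0, q0) back, one row per vst2.8.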
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"

#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

    "vswp        d5, d24                       \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

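// The "i" variants filter the three macroblock-interior edges, stepping one
// 4x4 subblock at a time: rows 4, 8 and 12 for the vertical filter, columns
// 4, 8 and 12 for the horizontal one.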
static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16NEON(p, stride, thresh);
  }
}

static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16NEON(p, stride, thresh);
  }
}

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

static void TransformOne(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t constants[] = {20091, 17734, 0, 0};
  /* kC1, kC2. Padded because vld1.16 loads 8 bytes.
   * Technically these are unsigned but vqdmulh is only available in signed.
   * vqdmulh returns the high half (effectively >> 16) but also doubles the
   * value, changing the >> 16 to >> 15 and requiring an additional >> 1.
   * We use this to our advantage with kC2. The canonical value is 35468.
   * However, its high bit is set, so treating it as signed would give
   * incorrect results. We avoid this by down-shifting by 1 here to clear the
   * highest bit. Combined with the doubling effect of vqdmulh we get >> 16.
   * This cannot be applied to kC1 because its lowest bit is set; down-shifting
   * that constant would lose precision.
   */

  /* libwebp uses a trick to avoid an extra addition that libvpx performs.
   * Instead of:
   * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
   * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
   * same issue with kC1 and vqdmulh that we work around by down-shifting kC2.
   */

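  /* Worked example (numbers picked for illustration): for x = 1000, the
   * desired (x * 35468) >> 16 is 541. With the halved constant, vqdmulh
   * computes (1000 * 17734 * 2) >> 16, which is the same 541, so kC2 needs
   * no fix-up. kC1 = 20091 is odd and cannot be halved without losing a bit,
   * so vqdmulh yields (x * 20091 * 2) >> 16 and the extra doubling is undone
   * by the "vshr.s16  q8, q8, #1" below; the following
   * "vqadd.s16  q8, q2, q8" adds x back in, playing the role of the
   * (1 << 16) folded into kC1 by the C code.
   */
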
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[constants]]        \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp            d3, d4                      \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16        q8, q8, #1                  \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* See the long-winded explanation above. */
    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
     * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"

    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"

    /* (val + 4) >> 3 */
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    /* Must accumulate before saturating */
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
  );
}

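// When 'do_two' is set, a second 4x4 block is transformed: its coefficients
// follow the first block's (in + 16) and its output lands 4 pixels to the
// right in the same destination rows (dst + 4).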
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}

static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int DC = (in[0] + 4) >> 3;
  const int kBPS = BPS;
  __asm__ volatile (
    "vdup.16         q1, %[DC]        \n"

    "vld1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d1[1], [%[dst]], %[kBPS]    \n"

    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"

    // add DC and convert to s16.
    "vaddw.u8        q2, q1, d0                  \n"
    "vaddw.u8        q3, q1, d1                  \n"
    // convert back to u8 with saturation
    "vqmovun.s16     d0,  q2                     \n"
    "vqmovun.s16     d1,  q3                     \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"
    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
    : [kBPS] "r"(kBPS),   /* constants */
      [DC] "r"(DC)
    : "memory", "q0", "q1", "q2", "q3"  /* clobbered */
  );
}

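// Scalar sketch of TransformDC (illustrative only, not referenced by the
// decoder; Clip8() and TransformDCSketch() are hypothetical names): a DC-only
// block adds the same rounded DC term to all 16 destination pixels with
// unsigned 8-bit saturation, which is what the vaddw.u8/vqmovun.s16 pair
// above implements four pixels at a time.
static int Clip8(int v) {   // saturate to [0, 255], like vqmovun.s16
  return (v < 0) ? 0 : (v > 255) ? 255 : v;
}
static void TransformDCSketch(const int16_t* in, uint8_t* dst) {
  const int DC = (in[0] + 4) >> 3;
  int i, j;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) {
      dst[i + j * BPS] = (uint8_t)Clip8(dst[i + j * BPS] + DC);
    }
  }
}
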
static void TransformWHT(const int16_t* in, int16_t* out) {
  const int kStep = 32;  // kStep is a byte offset: each 16-bit result is
                         // stored 16 coefficients (32 bytes) apart, and the
                         // post-increment operand of vst1.16 below counts
                         // bytes, not elements.
  __asm__ volatile (
    // part 1
    // load data into q0, q1
    "vld1.16         {q0, q1}, [%[in]]           \n"

    "vaddl.s16       q2, d0, d3                  \n"  // a0 = in[0] + in[12]
    "vaddl.s16       q3, d1, d2                  \n"  // a1 = in[4] + in[8]
    "vsubl.s16       q10, d1, d2                 \n"  // a2 = in[4] - in[8]
    "vsubl.s16       q11, d0, d3                 \n"  // a3 = in[0] - in[12]

    "vadd.s32        q0, q2, q3                  \n"  // tmp[0] = a0 + a1
    "vsub.s32        q2, q2, q3                  \n"  // tmp[8] = a0 - a1
    "vadd.s32        q1, q11, q10                \n"  // tmp[4] = a3 + a2
    "vsub.s32        q3, q11, q10                \n"  // tmp[12] = a3 - a2

    // Transpose
    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
    "vswp            d1, d4                      \n"  // vtrn.64 q0, q2
    "vswp            d3, d6                      \n"  // vtrn.64 q1, q3
    "vtrn.32         q0, q1                      \n"
    "vtrn.32         q2, q3                      \n"

    "vmov.s32        q10, #3                     \n"  // dc = 3
    "vadd.s32        q0, q0, q10                 \n"  // dc = tmp[0] + 3
    "vadd.s32        q12, q0, q3                 \n"  // a0 = dc + tmp[3]
    "vadd.s32        q13, q1, q2                 \n"  // a1 = tmp[1] + tmp[2]
    "vsub.s32        q8, q1, q2                  \n"  // a2 = tmp[1] - tmp[2]
    "vsub.s32        q9, q0, q3                  \n"  // a3 = dc - tmp[3]

    "vadd.s32        q0, q12, q13                \n"
    "vshrn.s32       d0, q0, #3                  \n"  // (a0 + a1) >> 3
    "vadd.s32        q1, q9, q8                  \n"
    "vshrn.s32       d1, q1, #3                  \n"  // (a3 + a2) >> 3
    "vsub.s32        q2, q12, q13                \n"
    "vshrn.s32       d2, q2, #3                  \n"  // (a0 - a1) >> 3
    "vsub.s32        q3, q9, q8                  \n"
    "vshrn.s32       d3, q3, #3                  \n"  // (a3 - a2) >> 3

    // set the results to output
    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
    "vst1.16         d1[0], [%[out]], %[kStep]   \n"
    "vst1.16         d2[0], [%[out]], %[kStep]   \n"
    "vst1.16         d3[0], [%[out]], %[kStep]   \n"
    "vst1.16         d0[1], [%[out]], %[kStep]   \n"
    "vst1.16         d1[1], [%[out]], %[kStep]   \n"
    "vst1.16         d2[1], [%[out]], %[kStep]   \n"
    "vst1.16         d3[1], [%[out]], %[kStep]   \n"
    "vst1.16         d0[2], [%[out]], %[kStep]   \n"
    "vst1.16         d1[2], [%[out]], %[kStep]   \n"
    "vst1.16         d2[2], [%[out]], %[kStep]   \n"
    "vst1.16         d3[2], [%[out]], %[kStep]   \n"
    "vst1.16         d0[3], [%[out]], %[kStep]   \n"
    "vst1.16         d1[3], [%[out]], %[kStep]   \n"
    "vst1.16         d2[3], [%[out]], %[kStep]   \n"
    "vst1.16         d3[3], [%[out]], %[kStep]   \n"

    : [out] "+r"(out)  // modified registers
    : [in] "r"(in), [kStep] "r"(kStep)  // constants
    : "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13"  // clobbered
  );
}

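// Scalar sketch of the Walsh-Hadamard inverse above (illustrative only, not
// referenced by the decoder; the name TransformWHTSketch is hypothetical).
// It makes the output layout explicit: each of the 16 results is the DC
// coefficient of one 4x4 block, so consecutive results land 16 int16_t
// (32 bytes, i.e. kStep) apart.
static void TransformWHTSketch(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {   // vertical pass
    const int a0 = in[0 + i] + in[12 + i];
    const int a1 = in[4 + i] + in[8 + i];
    const int a2 = in[4 + i] - in[8 + i];
    const int a3 = in[0 + i] - in[12 + i];
    tmp[0 + i] = a0 + a1;
    tmp[8 + i] = a0 - a1;
    tmp[4 + i] = a3 + a2;
    tmp[12 + i] = a3 - a2;
  }
  for (i = 0; i < 4; ++i) {   // horizontal pass, with the +3 rounder
    const int dc = tmp[0 + i * 4] + 3;
    const int a0 = dc + tmp[3 + i * 4];
    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    const int a3 = dc - tmp[3 + i * 4];
    out[0]  = (int16_t)((a0 + a1) >> 3);
    out[16] = (int16_t)((a3 + a2) >> 3);
    out[32] = (int16_t)((a0 - a1) >> 3);
    out[48] = (int16_t)((a3 - a2) >> 3);
    out += 64;
  }
}
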
#endif   // WEBP_USE_NEON

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformOne;  // no special code here
  VP8TransformDC = TransformDC;
  VP8TransformWHT = TransformWHT;

  VP8SimpleVFilter16 = SimpleVFilter16NEON;
  VP8SimpleHFilter16 = SimpleHFilter16NEON;
  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
#endif   // WEBP_USE_NEON
}