// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include "src/dsp/neon.h"
#include "src/dec/vp8i_dec.h"

//------------------------------------------------------------------------------
// NxM Loading functions

#if !defined(WORK_AROUND_GCC)

// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
// (register alloc, probably). The variants somewhat mitigate the problem, but
// not quite. HFilter16i() remains problematic.
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
                                            int stride) {
  const uint8x8_t zero = vdup_n_u8(0);
  uint8x8x4_t out;
  INIT_VECTOR4(out, zero, zero, zero, zero);
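  // Each vld4_lane_u8() below de-interleaves four consecutive bytes of a row
  // into lane LANE of the four output vectors, so after eight rows out.val[k]
  // holds column k. A scalar sketch of this transposing load, for reference:
  //   for (r = 0; r < 8; ++r)
  //     for (k = 0; k < 4; ++k) out.val[k][r] = src[r * stride + k];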
  out = vld4_lane_u8(src + 0 * stride, out, 0);
  out = vld4_lane_u8(src + 1 * stride, out, 1);
  out = vld4_lane_u8(src + 2 * stride, out, 2);
  out = vld4_lane_u8(src + 3 * stride, out, 3);
  out = vld4_lane_u8(src + 4 * stride, out, 4);
  out = vld4_lane_u8(src + 5 * stride, out, 5);
  out = vld4_lane_u8(src + 6 * stride, out, 6);
  out = vld4_lane_u8(src + 7 * stride, out, 7);
  return out;
}

static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
}

#else  // WORK_AROUND_GCC

#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
  src += stride;                                                     \
} while (0)

static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  const uint32x4_t zero = vdupq_n_u32(0);
  uint32x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  src -= 2;
  LOADQ_LANE_32b(in.val[0], 0);
  LOADQ_LANE_32b(in.val[1], 0);
  LOADQ_LANE_32b(in.val[2], 0);
  LOADQ_LANE_32b(in.val[3], 0);
  LOADQ_LANE_32b(in.val[0], 1);
  LOADQ_LANE_32b(in.val[1], 1);
  LOADQ_LANE_32b(in.val[2], 1);
  LOADQ_LANE_32b(in.val[3], 1);
  LOADQ_LANE_32b(in.val[0], 2);
  LOADQ_LANE_32b(in.val[1], 2);
  LOADQ_LANE_32b(in.val[2], 2);
  LOADQ_LANE_32b(in.val[3], 2);
  LOADQ_LANE_32b(in.val[0], 3);
  LOADQ_LANE_32b(in.val[1], 3);
  LOADQ_LANE_32b(in.val[2], 3);
  LOADQ_LANE_32b(in.val[3], 3);
  // Transpose four 4x4 parts:
  {
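    // Two-stage transpose: vtrnq_u8 swaps corresponding odd/even bytes
    // between row pairs, then vtrnq_u16 does the same at 16-bit granularity.
    // Together these realize a byte-level transpose of each 4x4 tile, so that
    // val[k] ends up holding column k of the loaded rows.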
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
                                        vreinterpretq_u8_u32(in.val[1]));
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
                                        vreinterpretq_u8_u32(in.val[3]));
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                         vreinterpretq_u16_u8(row23.val[0]));
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                         vreinterpretq_u16_u8(row23.val[1]));
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
  }
}
#undef LOADQ_LANE_32b

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Load8x16_NEON(
    const uint8_t* const src, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  *p1 = vld1q_u8(src - 2 * stride);
  *p0 = vld1q_u8(src - 1 * stride);
  *q0 = vld1q_u8(src + 0 * stride);
  *q1 = vld1q_u8(src + 1 * stride);
}

static WEBP_INLINE void Load16x8_NEON(
    const uint8_t* const src, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
  Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load8x8x2_NEON(
    const uint8_t* const u, const uint8_t* const v, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples in the upper half.
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}

#if !defined(WORK_AROUND_GCC)

#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

static WEBP_INLINE void Load8x8x2T_NEON(
    const uint8_t* const u, const uint8_t* const v, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples in the upper half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
                                      uint8_t* const dst, int stride) {
  vst2_lane_u8(dst + 0 * stride, v, 0);
  vst2_lane_u8(dst + 1 * stride, v, 1);
  vst2_lane_u8(dst + 2 * stride, v, 2);
  vst2_lane_u8(dst + 3 * stride, v, 3);
  vst2_lane_u8(dst + 4 * stride, v, 4);
  vst2_lane_u8(dst + 5 * stride, v, 5);
  vst2_lane_u8(dst + 6 * stride, v, 6);
  vst2_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
                                       uint8_t* const dst, int stride) {
  uint8x8x2_t lo, hi;
  lo.val[0] = vget_low_u8(p0);
  lo.val[1] = vget_low_u8(q0);
  hi.val[0] = vget_high_u8(p0);
  hi.val[1] = vget_high_u8(q0);
  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
}

#if !defined(WORK_AROUND_GCC)
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
                                      uint8_t* const dst, int stride) {
  vst4_lane_u8(dst + 0 * stride, v, 0);
  vst4_lane_u8(dst + 1 * stride, v, 1);
  vst4_lane_u8(dst + 2 * stride, v, 2);
  vst4_lane_u8(dst + 3 * stride, v, 3);
  vst4_lane_u8(dst + 4 * stride, v, 4);
  vst4_lane_u8(dst + 5 * stride, v, 5);
  vst4_lane_u8(dst + 6 * stride, v, 6);
  vst4_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                       const uint8x16_t q0, const uint8x16_t q1,
                                       uint8_t* const dst, int stride) {
  uint8x8x4_t lo, hi;
  INIT_VECTOR4(lo,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(hi,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
}
#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
                                       uint8_t* const dst, int stride) {
  vst1q_u8(dst - stride, p0);
  vst1q_u8(dst, q0);
}

static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                       const uint8x16_t q0, const uint8x16_t q1,
                                       uint8_t* const dst, int stride) {
  Store16x2_NEON(p1, p0, dst - stride, stride);
  Store16x2_NEON(q0, q1, dst + stride, stride);
}

static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  // p0 and q0 contain the u+v samples packed in low/high halves.
  vst1_u8(u - stride, vget_low_u8(p0));
  vst1_u8(u,          vget_low_u8(q0));
  vst1_u8(v - stride, vget_high_u8(p0));
  vst1_u8(v,          vget_high_u8(q0));
}

static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
                                        const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        const uint8x16_t q1,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
  Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
  Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
}

#if !defined(WORK_AROUND_GCC)

#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
  (DST) += stride;                                \
} while (0)
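
// Each STORE6_LANE() writes six consecutive pixels of one row: the p2,p1,p0
// triple via vst3_lane_u8() at DST-3 and the q0,q1,q2 triple at DST, i.e. the
// samples [-3..+2] around the edge, then advances DST to the next row.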

static WEBP_INLINE void Store6x8x2_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    uint8_t* u, uint8_t* v, int stride) {
  uint8x8x3_t u0, u1, v0, v1;
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
  STORE6_LANE(u, u0, u1, 0);
  STORE6_LANE(u, u0, u1, 1);
  STORE6_LANE(u, u0, u1, 2);
  STORE6_LANE(u, u0, u1, 3);
  STORE6_LANE(u, u0, u1, 4);
  STORE6_LANE(u, u0, u1, 5);
  STORE6_LANE(u, u0, u1, 6);
  STORE6_LANE(u, u0, u1, 7);
  STORE6_LANE(v, v0, v1, 0);
  STORE6_LANE(v, v0, v1, 1);
  STORE6_LANE(v, v0, v1, 2);
  STORE6_LANE(v, v0, v1, 3);
  STORE6_LANE(v, v0, v1, 4);
  STORE6_LANE(v, v0, v1, 5);
  STORE6_LANE(v, v0, v1, 6);
  STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE

static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
                                        const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        const uint8x16_t q1,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  uint8x8x4_t u0, v0;
  INIT_VECTOR4(u0,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(v0,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}

#endif  // !WORK_AROUND_GCC

// Zero extend 'v' to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
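    // Note: vrsraq_n_s16(a, b, 3) computes a + ((b + 4) >> 3) per lane,
    // fusing the rounding right-shift and the accumulate in one instruction.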
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   int thresh) {
  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
  return mask;
}

static int8x16_t FlipSign_NEON(const uint8x16_t v) {
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}

static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
  const int8x16_t sign_bit = vdupq_n_s8(0x80);
  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}

static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
                                   const int8x16_t q0, const int8x16_t q1) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
  return s3;
}

static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
  return s2;
}
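
// Note: the saturating vqsub/vqadd ops above keep the intermediate delta
// within the int8 range, which mirrors the clamping that the reference C
// filter applies to this expression.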

//------------------------------------------------------------------------------

static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
                                    const int8x16_t delta,
                                    int8x16_t* const op0,
                                    int8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  *op0 = vqaddq_s8(p0s, delta3);
  *oq0 = vqsubq_s8(q0s, delta4);
}

#if defined(WEBP_USE_INTRINSICS)

static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
                              const int8x16_t delta,
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
  *op0 = FlipSignBack_NEON(sp0);
  *oq0 = FlipSignBack_NEON(sq0);
}

static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           const uint8x16_t mask,
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t p1s = FlipSign_NEON(p1);
  const int8x16_t p0s = FlipSign_NEON(p0);
  const int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
  ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
}

static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, op0, oq0;
  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store16x2_NEON(op0, oq0, p, stride);
}

static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, oq0, op0;
  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store2x16_NEON(op0, oq0, p, stride);
}

#else

// Load/Store vertical edge
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"                             \

#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)                                                      \

#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
574  "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */

#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
579  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (p0 - q0) */ \
580  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (p0 - q0) */ \
581  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (p0 - q0) */

#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
590  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 4 */      \
591  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
592  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
593
594// Applies filter on 2 pixels (p0 and q0)
595#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
596  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
597  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
598  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
599  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
600  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
601  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
602  FLIP_SIGN_BIT2(p0, q0, q10)
603
604static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
605  __asm__ volatile (
606    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
607
608    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
609    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
610    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
611    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
612
613    DO_FILTER2(q1, q2, q3, q12, %[thresh])
614
615    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
616
617    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
618    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
619    : [p] "+r"(p)
620    : [stride] "r"(stride), [thresh] "r"(thresh)
621    : "memory", QRegs
622  );
623}
624
625static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
626  __asm__ volatile (
627    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
628    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
629    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
630
631    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
632    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
633    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
634    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
635    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
636
637    DO_FILTER2(q1, q2, q12, q13, %[thresh])
638
639    "sub        %[p], %[p], #1                 \n"  // p - 1
640
641    "vswp        d5, d24                       \n"
642    STORE8x2(d4, d5, [%[p]], %[stride])
643    STORE8x2(d24, d25, [%[p]], %[stride])
644
645    : [p] "+r"(p)
646    : [stride] "r"(stride), [thresh] "r"(thresh)
647    : "memory", "r4", "r5", "r6", QRegs
648  );
649}
650
651#undef LOAD8x4
652#undef STORE8x2
653
654#endif    // WEBP_USE_INTRINSICS
655
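// The "i" variants below filter the three inner edges of a 16x16 macroblock,
// i.e. the horizontal edges 4, 8 and 12 rows down (V) or the vertical edges
// 4, 8 and 12 pixels in (H), hence the loops of three iterations.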
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4 * stride;
    SimpleVFilter16_NEON(p, stride, thresh);
  }
}

static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4;
    SimpleHFilter16_NEON(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                const uint8x16_t q0, const uint8x16_t q1,
                                int hev_thresh) {
  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
  return mask;
}

static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
                                    const uint8x16_t p1, const uint8x16_t p0,
                                    const uint8x16_t q0, const uint8x16_t q1,
                                    const uint8x16_t q2, const uint8x16_t q3,
                                    int ithresh, int thresh) {
  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
  const uint8x16_t max12 = vmaxq_u8(max1, max2);
  const uint8x16_t max123 = vmaxq_u8(max12, max3);
  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
  const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
  const uint8x16_t mask = vandq_u8(mask1, mask2);
  return mask;
}

//  4-points filter

static void ApplyFilter4_NEON(
    const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1,
    const int8x16_t delta0,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
}

static void DoFilter4_NEON(
    const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    const int8x16_t simple_lf_delta =
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter4 part (complex loopfilter on pixels without hev)
  {
    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}

//  6-points filter

static void ApplyFilter6_NEON(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
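  // (Arithmetic check: vqrshrn_n_s16(x, n) computes (x + (1 << (n - 1))) >> n
  // with saturating narrowing, so vqrshrn_n_s16(S, 7) = (9 * a + 63) >> 7 = X
  // and vqrshrn_n_s16(S, 6) = (9 * a + 31) >> 6, which equals Y because
  // 18 * a is even.)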
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}

static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign_NEON(p2);
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t q2s = FlipSign_NEON(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                      op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

static void VFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
  }
}

static void HFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16_NEON(op2, op1, p - 2, stride);
    Store2x16_NEON(op0, oq0, p + 0, stride);
    Store2x16_NEON(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
static void VFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16_NEON(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}

static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns the high half (effectively >> 16) but also doubles the
// value, changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, its high bit is set, so treating it as signed would give incorrect
// results. We avoid this by down-shifting the constant by 1 here to clear the
// highest bit. Combined with the doubling effect of vqdmulh we get >> 16.
// This cannot be applied to kC1 because its lowest bit is set; down-shifting
// it would lose precision.
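// (Worked example of the kC2 trick: for x = 1000 the exact value is
// (1000 * 35468) >> 16 = 541. vqdmulhq_n_s16 with the halved constant gives
// (2 * 1000 * 17734) >> 16 = (1000 * 35468) >> 16 = 541 as well, while 17734
// fits comfortably in a positive int16.)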

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
//   temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down-shifting kC2.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}

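// TransformPass_NEON() leaves its output transposed (via Transpose8x2_NEON),
// so applying it twice performs the row pass followed by the column pass of
// the 4x4 inverse transform.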
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], dst);
}

#else

static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[constants]]        \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp            d3, d4                      \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16        q8, q8, #1                  \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* See long winded explanations prior */
    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
     * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
     */
1134    "vqsub.s16       d20, d18, d17               \n"
1135    "vqadd.s16       d21, d19, d16               \n"
1136
1137    /* d2 = tmp[0] = a + d
1138     * d3 = tmp[1] = b + c
1139     * d4 = tmp[2] = b - c
1140     * d5 = tmp[3] = a - d
1141     */
1142    "vqadd.s16       d2, d22, d21                \n"
1143    "vqadd.s16       d3, d23, d20                \n"
1144    "vqsub.s16       d4, d23, d20                \n"
1145    "vqsub.s16       d5, d22, d21                \n"
1146
1147    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
1148    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
1149    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
1150    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
1151
1152    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
1153
1154    /* (val) + 4 >> 3 */
1155    "vrshr.s16       d2, d2, #3                  \n"
1156    "vrshr.s16       d3, d3, #3                  \n"
1157    "vrshr.s16       d4, d4, #3                  \n"
1158    "vrshr.s16       d5, d5, #3                  \n"
1159
1160    "vzip.16         q1, q2                      \n"
1161    "vzip.16         q1, q2                      \n"
1162
1163    /* Must accumulate before saturating */
1164    "vmovl.u8        q8, d6                      \n"
1165    "vmovl.u8        q9, d7                      \n"
1166
1167    "vqadd.s16       q1, q1, q8                  \n"
1168    "vqadd.s16       q2, q2, q9                  \n"
1169
1170    "vqmovun.s16     d0, q1                      \n"
1171    "vqmovun.s16     d1, q2                      \n"
1172
1173    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1174    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1175    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1176    "vst1.32         d1[1], [%[dst]]             \n"
1177
1178    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
1179    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
1180    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
1181  );
1182}
1183
1184#endif    // WEBP_USE_INTRINSICS
1185
1186static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
1187  TransformOne_NEON(in, dst);
1188  if (do_two) {
1189    TransformOne_NEON(in + 16, dst + 4);
1190  }
1191}
1192
1193static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1194  const int16x8_t DC = vdupq_n_s16(in[0]);
1195  Add4x4_NEON(DC, DC, dst);
1196}
1197
1198//------------------------------------------------------------------------------
1199
1200#define STORE_WHT(dst, col, rows) do {                  \
1201  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1202  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1203  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1204  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1205} while (0)
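
// The '+= 16' stride scatters each result into the DC position (coefficient
// 0) of successive 16-coefficient blocks, which is where the WHT output is
// picked up by the per-block inverse transforms.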
1206
1207static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
1208  int32x4x4_t tmp;
1209
1210  {
1211    // Load the source.
1212    const int16x4_t in00_03 = vld1_s16(in + 0);
1213    const int16x4_t in04_07 = vld1_s16(in + 4);
1214    const int16x4_t in08_11 = vld1_s16(in + 8);
1215    const int16x4_t in12_15 = vld1_s16(in + 12);
1216    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
1217    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
1218    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
1219    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
1220    tmp.val[0] = vaddq_s32(a0, a1);
1221    tmp.val[1] = vaddq_s32(a3, a2);
1222    tmp.val[2] = vsubq_s32(a0, a1);
1223    tmp.val[3] = vsubq_s32(a3, a2);
1224    // Arrange the temporary results column-wise.
1225    tmp = Transpose4x4_NEON(tmp);
1226  }
1227
1228  {
1229    const int32x4_t kCst3 = vdupq_n_s32(3);
1230    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
1231    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1232    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1233    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1234    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1235
1236    tmp.val[0] = vaddq_s32(a0, a1);
1237    tmp.val[1] = vaddq_s32(a3, a2);
1238    tmp.val[2] = vsubq_s32(a0, a1);
1239    tmp.val[3] = vsubq_s32(a3, a2);
1240
1241    // right shift the results by 3.
1242    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1243    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1244    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1245    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1246
1247    STORE_WHT(out, 0, tmp);
1248    STORE_WHT(out, 1, tmp);
1249    STORE_WHT(out, 2, tmp);
1250    STORE_WHT(out, 3, tmp);
1251  }
1252}
1253
1254#undef STORE_WHT
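
// For reference, a scalar sketch of the same Walsh-Hadamard inverse (not
// compiled; 'TransformWHT_C_ref' is an illustrative name). Note how each of
// the 16 outputs lands 16 coefficients apart, one per 4x4 luma block.
#if 0
static void TransformWHT_C_ref(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {   // vertical pass
    const int a0 = in[0 + i] + in[12 + i];
    const int a1 = in[4 + i] + in[8 + i];
    const int a2 = in[4 + i] - in[8 + i];
    const int a3 = in[0 + i] - in[12 + i];
    tmp[0 + i] = a0 + a1;
    tmp[4 + i] = a3 + a2;
    tmp[8 + i] = a0 - a1;
    tmp[12 + i] = a3 - a2;
  }
  for (i = 0; i < 4; ++i) {   // horizontal pass
    const int dc = tmp[0 + i * 4] + 3;   // rounder
    const int a0 = dc + tmp[3 + i * 4];
    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    const int a3 = dc - tmp[3 + i * 4];
    out[0]  = (a0 + a1) >> 3;
    out[16] = (a3 + a2) >> 3;
    out[32] = (a0 - a1) >> 3;
    out[48] = (a3 - a2) >> 3;
    out += 64;   // next row of 4x4 blocks
  }
}
#endif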

//------------------------------------------------------------------------------

#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);  // 16.16 sqrt(2)*cos(pi/8)
  static const int kC2_full = 35468;              // 16.16 sqrt(2)*sin(pi/8)
  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4_NEON(m0_m1, m2_m3, dst);
}
#undef MUL
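
// A scalar sketch of the AC3 case for reference (not compiled; it reuses the
// MUL() macro and the kC1_full/kC2_full constants from above). Only in[0],
// in[1] and in[4] can be non-zero here, so the 2-D transform collapses into a
// DC term plus separable row and column contributions.
#if 0
static void TransformAC3_C_ref(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;              // DC, with the >>3 rounder folded in
  const int c4 = MUL(in[4], kC2_full);  // vertical (row) contributions
  const int d4 = MUL(in[4], kC1_full);
  const int c1 = MUL(in[1], kC2_full);  // horizontal (column) contributions
  const int d1 = MUL(in[1], kC1_full);
  const int row[4] = { d4, c4, -c4, -d4 };
  const int col[4] = { d1, c1, -c1, -d1 };
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const int v = dst[x + y * BPS] + ((a + row[y] + col[x]) >> 3);
      dst[x + y * BPS] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}
#endif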

//------------------------------------------------------------------------------
// 4x4

static void DC4_NEON(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  // Each left-column load reads 8 pixels, but only lane 0 (the left pixel
  // itself) contributes to lane 0 of 'sum', which is all that gets used.
  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
  const uint16x8_t s0 = vaddq_u16(L0, L1);
  const uint16x8_t s1 = vaddq_u16(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}
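
// Scalar sketch for reference (not compiled): DC4 fills the block with the
// rounded average of the 4 top and 4 left neighbors.
#if 0
static void DC4_C_ref(uint8_t* dst) {
  int dc = 4;   // rounder
  int i, x, y;
  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
  dc >>= 3;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) dst[x + y * BPS] = (uint8_t)dc;
  }
}
#endif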

// TrueMotion (4x4 + 8x8)
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..size-1]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
    const int16x8_t r1 = vaddq_s16(L1, d);
    const int16x8_t r2 = vaddq_s16(L2, d);
    const int16x8_t r3 = vaddq_s16(L3, d);
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}

static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
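
// Scalar sketch of the TrueMotion predictor for reference (not compiled):
// pred(x, y) = clip(left[y] + top[x] - top_left), i.e. the left pixel plus
// the horizontal gradient of the top row, clamped to 8 bits.
#if 0
static void TrueMotion_C_ref(uint8_t* dst, int size) {
  const uint8_t* const top = dst - BPS;
  const int tl = top[-1];
  int x, y;
  for (y = 0; y < size; ++y) {
    const int left = dst[-1 + y * BPS];
    for (x = 0; x < size; ++x) {
      const int v = left + top[x] - tl;
      dst[x + y * BPS] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}
#endif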

static void VE4_NEON(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}
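
// Note on VE4's rounding: the predictor is the 3-tap filter
// (A + 2 * B + C + 2) >> 2. vhadd_u8 computes (A + C) >> 1 (truncating) and
// vrhadd_u8 then averages with B and rounds; the truncated bit is only lost
// when A + C is odd, in which case the full numerator is odd and the final
// floor is unaffected, so the result matches the filter exactly.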

static void RD4_NEON(uint8_t* dst) {   // Down-right
  // Naming: letters are the neighboring pixels X, A..D (top row) and I..L
  // (left column); '_' marks a don't-care byte.
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64x1_t LKJI____ =
      vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  // The four output rows are overlapping 4-byte windows of 'avg2'.
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

static void LD4_NEON(uint8_t* dst) {    // Down-left
  // Note: using the same shift trick as in VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

//------------------------------------------------------------------------------
// Chroma

static void VE8uv_NEON(uint8_t* dst) {    // vertical
  const uint8x8_t top = vld1_u8(dst - BPS);
  int j;
  for (j = 0; j < 8; ++j) {
    vst1_u8(dst + j * BPS, top);
  }
}

static void HE8uv_NEON(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    const uint8x8_t left = vld1_dup_u8(dst - 1);
    vst1_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
    const uint16x8_t s0 = vaddq_u16(L0, L1);
    const uint16x8_t s1 = vaddq_u16(L2, L3);
    const uint16x8_t s2 = vaddq_u16(L4, L5);
    const uint16x8_t s3 = vaddq_u16(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);      // (sum + 8) >> 4: average of 16 pixels
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);  // (sum + 4) >> 3: average of 8 pixels
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);           // default when no context is available
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}

static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }

static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

static void VE16_NEON(uint8_t* dst) {     // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16_NEON(uint8_t* dst) {     // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
      const uint16x8_t s0 = vaddq_u16(L0, L1);
      const uint16x8_t s1 = vaddq_u16(L2, L3);
      const uint16x8_t s2 = vaddq_u16(L4, L5);
      const uint16x8_t s3 = vaddq_u16(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}

static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }

static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);
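
// Note: this initializer is expected to be called from the generic
// VP8DspInit() once NEON support has been detected; any hook not overridden
// below keeps the C default installed there. (This describes the usual
// libwebp dispatch flow; see src/dsp/dsp.c for the authoritative logic.)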

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo_NEON;
  VP8TransformAC3 = TransformAC3_NEON;
  VP8TransformDC = TransformDC_NEON;
  VP8TransformWHT = TransformWHT_NEON;

  VP8VFilter16 = VFilter16_NEON;
  VP8VFilter16i = VFilter16i_NEON;
  VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i_NEON;
#endif
  VP8VFilter8 = VFilter8_NEON;
  VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8_NEON;
  VP8HFilter8i = HFilter8i_NEON;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;

  VP8PredLuma4[0] = DC4_NEON;
  VP8PredLuma4[1] = TM4_NEON;
  VP8PredLuma4[2] = VE4_NEON;
  VP8PredLuma4[4] = RD4_NEON;
  VP8PredLuma4[6] = LD4_NEON;
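  // The remaining 4x4 modes (HE4, VR4, VL4, HD4, HU4) have no NEON version
  // and keep their generic C implementations.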

  VP8PredLuma16[0] = DC16TopLeft_NEON;
  VP8PredLuma16[1] = TM16_NEON;
  VP8PredLuma16[2] = VE16_NEON;
  VP8PredLuma16[3] = HE16_NEON;
  VP8PredLuma16[4] = DC16NoTop_NEON;
  VP8PredLuma16[5] = DC16NoLeft_NEON;
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;

  VP8PredChroma8[0] = DC8uv_NEON;
  VP8PredChroma8[1] = TM8uv_NEON;
  VP8PredChroma8[2] = VE8uv_NEON;
  VP8PredChroma8[3] = HE8uv_NEON;
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON