/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"

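// Applies the VP8 normal (non-simple) loop filter to 16 pixels across an
// edge. p3..q3 are the four pixels on either side of the edge; the filtered
// p1, p0, q0, q1 values are written back through the output pointers.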
static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit,  // flimit
                                        uint8x16_t qlimit,   // limit
                                        uint8x16_t qthresh,  // thresh
                                        uint8x16_t q3,       // p3
                                        uint8x16_t q4,       // p2
                                        uint8x16_t q5,       // p1
                                        uint8x16_t q6,       // p0
                                        uint8x16_t q7,       // q0
                                        uint8x16_t q8,       // q1
                                        uint8x16_t q9,       // q2
                                        uint8x16_t q10,      // q3
                                        uint8x16_t *q5r,     // p1
                                        uint8x16_t *q6r,     // p0
                                        uint8x16_t *q7r,     // q0
                                        uint8x16_t *q8r) {   // q1
  uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int16x8_t q2s16, q11s16;
  uint16x8_t q4u16;
  int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
  int8x8_t d2s8, d3s8;

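  // Filter mask: the edge is filtered only where every neighbouring pixel
  // difference is <= limit.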
  q11u8 = vabdq_u8(q3, q4);
  q12u8 = vabdq_u8(q4, q5);
  q13u8 = vabdq_u8(q5, q6);
  q14u8 = vabdq_u8(q8, q7);
  q3 = vabdq_u8(q9, q8);
  q4 = vabdq_u8(q10, q9);

  q11u8 = vmaxq_u8(q11u8, q12u8);
  q12u8 = vmaxq_u8(q13u8, q14u8);
  q3 = vmaxq_u8(q3, q4);
  q15u8 = vmaxq_u8(q11u8, q12u8);

  q9 = vabdq_u8(q6, q7);

  // vp8_hevmask
  q13u8 = vcgtq_u8(q13u8, qthresh);
  q14u8 = vcgtq_u8(q14u8, qthresh);
  q15u8 = vmaxq_u8(q15u8, q3);

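  // Flat-edge check: 2 * abs(p0 - q0) + abs(p1 - q1) / 2 must be <= blimit.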
  q2u8 = vabdq_u8(q5, q8);
  q9 = vqaddq_u8(q9, q9);

  q15u8 = vcgeq_u8(qlimit, q15u8);

  // vp8_filter() function
  // convert to signed
  q10 = vdupq_n_u8(0x80);
  q8 = veorq_u8(q8, q10);
  q7 = veorq_u8(q7, q10);
  q6 = veorq_u8(q6, q10);
  q5 = veorq_u8(q5, q10);

  q2u8 = vshrq_n_u8(q2u8, 1);
  q9 = vqaddq_u8(q9, q2u8);

  q10 = vdupq_n_u8(3);

  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                   vget_low_s8(vreinterpretq_s8_u8(q6)));
  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                    vget_high_s8(vreinterpretq_s8_u8(q6)));

  q9 = vcgeq_u8(qblimit, q9);

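  // vp8_filter = clamp(ps1 - qs1); it contributes only at high edge
  // variance pixels once ANDed with the hev mask below.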
  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));

  q14u8 = vorrq_u8(q13u8, q14u8);

  q4u16 = vmovl_u8(vget_low_u8(q10));
  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
  q15u8 = vandq_u8(q15u8, q9);

  q1s8 = vreinterpretq_s8_u8(q1u8);
  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

  q9 = vdupq_n_u8(4);
  // vp8_filter = clamp(vp8_filter + 3 * (qs0 - ps0))
  d2s8 = vqmovn_s16(q2s16);
  d3s8 = vqmovn_s16(q11s16);
  q1s8 = vcombine_s8(d2s8, d3s8);
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
  q1s8 = vreinterpretq_s8_u8(q1u8);

  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
  q2s8 = vshrq_n_s8(q2s8, 3);
  q1s8 = vshrq_n_s8(q1s8, 3);

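  // q2s8 is Filter2 = clamp(vp8_filter + 3) >> 3 and q1s8 is
  // Filter1 = clamp(vp8_filter + 4) >> 3; add Filter2 to p0, subtract
  // Filter1 from q0.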
  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
  q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

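  // Outer taps: vp8_filter = (Filter1 + 1) >> 1, zeroed where hev is set,
  // then added to p1 and subtracted from q1.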
  q1s8 = vrshrq_n_s8(q1s8, 1);
  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

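  // Convert back to unsigned by flipping the sign bit again.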
  q0u8 = vdupq_n_u8(0x80);
  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
  *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
}

void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
                                            unsigned char blimit,
                                            unsigned char limit,
                                            unsigned char thresh) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);
  src -= (pitch << 2);

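  // Load 8 rows (p3, p2, p1, p0, q0, q1, q2, q3), 16 pixels wide.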
  q3 = vld1q_u8(src);
  src += pitch;
  q4 = vld1q_u8(src);
  src += pitch;
  q5 = vld1q_u8(src);
  src += pitch;
  q6 = vld1q_u8(src);
  src += pitch;
  q7 = vld1q_u8(src);
  src += pitch;
  q8 = vld1q_u8(src);
  src += pitch;
  q9 = vld1q_u8(src);
  src += pitch;
  q10 = vld1q_u8(src);

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

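  // Only p1, p0, q0, q1 change; store those four rows back.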
  src -= (pitch * 5);
  vst1q_u8(src, q5);
  src += pitch;
  vst1q_u8(src, q6);
  src += pitch;
  vst1q_u8(src, q7);
  src += pitch;
  vst1q_u8(src, q8);
}

void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
                                             unsigned char blimit,
                                             unsigned char limit,
                                             unsigned char thresh,
                                             unsigned char *v) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  u -= (pitch << 2);
  v -= (pitch << 2);

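  // Load matching U and V rows; each q register packs a U row in its low
  // half and the corresponding V row in its high half so both planes are
  // filtered in one pass.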
  d6 = vld1_u8(u);
  u += pitch;
  d7 = vld1_u8(v);
  v += pitch;
  d8 = vld1_u8(u);
  u += pitch;
  d9 = vld1_u8(v);
  v += pitch;
  d10 = vld1_u8(u);
  u += pitch;
  d11 = vld1_u8(v);
  v += pitch;
  d12 = vld1_u8(u);
  u += pitch;
  d13 = vld1_u8(v);
  v += pitch;
  d14 = vld1_u8(u);
  u += pitch;
  d15 = vld1_u8(v);
  v += pitch;
  d16 = vld1_u8(u);
  u += pitch;
  d17 = vld1_u8(v);
  v += pitch;
  d18 = vld1_u8(u);
  u += pitch;
  d19 = vld1_u8(v);
  v += pitch;
  d20 = vld1_u8(u);
  d21 = vld1_u8(v);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

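  // Unpack the results: low halves go back to the U plane, high halves to
  // the V plane.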
  u -= (pitch * 5);
  vst1_u8(u, vget_low_u8(q5));
  u += pitch;
  vst1_u8(u, vget_low_u8(q6));
  u += pitch;
  vst1_u8(u, vget_low_u8(q7));
  u += pitch;
  vst1_u8(u, vget_low_u8(q8));

  v -= (pitch * 5);
  vst1_u8(v, vget_high_u8(q5));
  v += pitch;
  vst1_u8(v, vget_high_u8(q6));
  v += pitch;
  vst1_u8(v, vget_high_u8(q7));
  v += pitch;
  vst1_u8(v, vget_high_u8(q8));
}

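// Scatters a 4x8 block column-wise: lane k of the four result vectors
// supplies the 4 consecutive bytes written to row k.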
static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
#ifdef VPX_INCOMPATIBLE_GCC
  /*
   * uint8x8x4_t result
  00 01 02 03 | 04 05 06 07
  10 11 12 13 | 14 15 16 17
  20 21 22 23 | 24 25 26 27
  30 31 32 33 | 34 35 36 37
  ---
  * after vtrn_u16
  00 01 20 21 | 04 05 24 25
  02 03 22 23 | 06 07 26 27
  10 11 30 31 | 14 15 34 35
  12 13 32 33 | 16 17 36 37
  ---
  * after vtrn_u8
  00 10 20 30 | 04 14 24 34
  01 11 21 31 | 05 15 25 35
  02 12 22 32 | 06 16 26 36
  03 13 23 33 | 07 17 27 37
  */
  const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                        vreinterpret_u16_u8(result.val[2]));
  const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                        vreinterpret_u16_u8(result.val[3]));
  const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                     vreinterpret_u8_u16(r13_u16.val[0]));
  const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                     vreinterpret_u8_u16(r13_u16.val[1]));
  const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
  const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
  const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
  const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
  vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#else
  vst4_lane_u8(dst, result, 0);
  dst += pitch;
  vst4_lane_u8(dst, result, 1);
  dst += pitch;
  vst4_lane_u8(dst, result, 2);
  dst += pitch;
  vst4_lane_u8(dst, result, 3);
  dst += pitch;
  vst4_lane_u8(dst, result, 4);
  dst += pitch;
  vst4_lane_u8(dst, result, 5);
  dst += pitch;
  vst4_lane_u8(dst, result, 6);
  dst += pitch;
  vst4_lane_u8(dst, result, 7);
#endif  // VPX_INCOMPATIBLE_GCC
}

void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
                                          unsigned char blimit,
                                          unsigned char limit,
                                          unsigned char thresh) {
  unsigned char *s, *d;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
  uint8x8x4_t q4ResultH, q4ResultL;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

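  // Load 16 rows of 8 pixels straddling the vertical edge (4 on each side).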
  s = src - 4;
  d6 = vld1_u8(s);
  s += pitch;
  d8 = vld1_u8(s);
  s += pitch;
  d10 = vld1_u8(s);
  s += pitch;
  d12 = vld1_u8(s);
  s += pitch;
  d14 = vld1_u8(s);
  s += pitch;
  d16 = vld1_u8(s);
  s += pitch;
  d18 = vld1_u8(s);
  s += pitch;
  d20 = vld1_u8(s);
  s += pitch;
  d7 = vld1_u8(s);
  s += pitch;
  d9 = vld1_u8(s);
  s += pitch;
  d11 = vld1_u8(s);
  s += pitch;
  d13 = vld1_u8(s);
  s += pitch;
  d15 = vld1_u8(s);
  s += pitch;
  d17 = vld1_u8(s);
  s += pitch;
  d19 = vld1_u8(s);
  s += pitch;
  d21 = vld1_u8(s);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

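  // Transpose the 16x8 block with vtrn at 32-, 16- and then 8-bit
  // granularity so each q register holds one pixel column (p3..q3).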
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

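  // Gather the filtered columns p1, p0, q0, q1 so write_4x8 can transpose
  // them back into rows: low halves cover rows 0-7, high halves rows 8-15.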
  q4ResultL.val[0] = vget_low_u8(q5);   // d10
  q4ResultL.val[1] = vget_low_u8(q6);   // d12
  q4ResultL.val[2] = vget_low_u8(q7);   // d14
  q4ResultL.val[3] = vget_low_u8(q8);   // d16
  q4ResultH.val[0] = vget_high_u8(q5);  // d11
  q4ResultH.val[1] = vget_high_u8(q6);  // d13
  q4ResultH.val[2] = vget_high_u8(q7);  // d15
  q4ResultH.val[3] = vget_high_u8(q8);  // d17

  d = src - 2;
  write_4x8(d, pitch, q4ResultL);
  d += pitch * 8;
  write_4x8(d, pitch, q4ResultH);
}

void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
                                           unsigned char blimit,
                                           unsigned char limit,
                                           unsigned char thresh,
                                           unsigned char *v) {
  unsigned char *us, *ud;
  unsigned char *vs, *vd;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
  uint8x8x4_t q4ResultH, q4ResultL;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

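  // Load 8 rows from each plane around the edge; U rows fill the low halves
  // and V rows the high halves of the combined q registers.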
  us = u - 4;
  d6 = vld1_u8(us);
  us += pitch;
  d8 = vld1_u8(us);
  us += pitch;
  d10 = vld1_u8(us);
  us += pitch;
  d12 = vld1_u8(us);
  us += pitch;
  d14 = vld1_u8(us);
  us += pitch;
  d16 = vld1_u8(us);
  us += pitch;
  d18 = vld1_u8(us);
  us += pitch;
  d20 = vld1_u8(us);

  vs = v - 4;
  d7 = vld1_u8(vs);
  vs += pitch;
  d9 = vld1_u8(vs);
  vs += pitch;
  d11 = vld1_u8(vs);
  vs += pitch;
  d13 = vld1_u8(vs);
  vs += pitch;
  d15 = vld1_u8(vs);
  vs += pitch;
  d17 = vld1_u8(vs);
  vs += pitch;
  d19 = vld1_u8(vs);
  vs += pitch;
  d21 = vld1_u8(vs);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

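  // Same 32/16/8-bit vtrn transpose as the Y path, turning rows into pixel
  // columns p3..q3.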
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  q4ResultL.val[0] = vget_low_u8(q5);  // d10
  q4ResultL.val[1] = vget_low_u8(q6);  // d12
  q4ResultL.val[2] = vget_low_u8(q7);  // d14
  q4ResultL.val[3] = vget_low_u8(q8);  // d16
  ud = u - 2;
  write_4x8(ud, pitch, q4ResultL);

  q4ResultH.val[0] = vget_high_u8(q5);  // d11
  q4ResultH.val[1] = vget_high_u8(q6);  // d13
  q4ResultH.val[2] = vget_high_u8(q7);  // d15
  q4ResultH.val[3] = vget_high_u8(q8);  // d17
  vd = v - 2;
  write_4x8(vd, pitch, q4ResultH);
}