/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

//------------------------------------------------------------------------------
// DC 4x4

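// DC prediction fills the block with a single value: the rounded average of
// the reference pixels. In scalar form, the full-DC 4x4 case below is roughly
// (an illustrative sketch, not part of the build):
//   sum = above[0] + ... + above[3] + left[0] + ... + left[3];
//   dc = (sum + 4) >> 3;  // vrshr_n_u16(sum, 3)
// The left-only and top-only variants average just 4 samples, so they round
// with (sum + 2) >> 2 instead.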
static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
  const uint16x4_t ref_u16 = vld1_u16(ref);
  const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16);
  return vpadd_u16(p0, p0);
}

static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const uint16x4_t dc) {
  const uint16x4_t dc_dup = vdup_lane_u16(dc, 0);
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    vst1_u16(dst, dc_dup);
  }
}

void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x4_t a = vld1_u16(above);
  const uint16x4_t l = vld1_u16(left);
  uint16x4_t sum;
  uint16x4_t dc;
  (void)bd;
  sum = vadd_u16(a, l);
  sum = vpadd_u16(sum, sum);
  sum = vpadd_u16(sum, sum);
  dc = vrshr_n_u16(sum, 3);
  dc_store_4x4(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16x4_t sum = dc_sum_4(left);
  const uint16x4_t dc = vrshr_n_u16(sum, 2);
  (void)above;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t sum = dc_sum_4(above);
  const uint16x4_t dc = vrshr_n_u16(sum, 2);
  (void)left;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

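// When neither border is usable, DC_128 falls back to the mid-range value
// 1 << (bd - 1): 128 at 8 bits, 512 at 10 bits, 2048 at 12 bits.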
void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_4x4(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 8x8

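// 8-element reduction: fold the high half onto the low half, then two
// pairwise adds leave the total in every lane of the uint16x4_t result.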
static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
  const uint16x8_t ref_u16 = vld1q_u16(ref);
  uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16));
  sum = vpadd_u16(sum, sum);
  return vpadd_u16(sum, sum);
}

static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
                                const uint16x4_t dc) {
  const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0);
  int i;
  for (i = 0; i < 8; ++i, dst += stride) {
    vst1q_u16(dst, dc_dup);
  }
}

void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x8_t above_u16 = vld1q_u16(above);
  const uint16x8_t left_u16 = vld1q_u16(left);
  const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
  uint16x4_t dc;
  (void)bd;
  sum = vpadd_u16(sum, sum);
  sum = vpadd_u16(sum, sum);
  dc = vrshr_n_u16(sum, 4);
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16x4_t sum = dc_sum_8(left);
  const uint16x4_t dc = vrshr_n_u16(sum, 3);
  (void)above;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t sum = dc_sum_8(above);
  const uint16x4_t dc = vrshr_n_u16(sum, 3);
  (void)left;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_8x8(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 16x16

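// vld2q_u16 is used here purely as a 16-pixel contiguous load; the
// de-interleaved lane order is irrelevant to a sum. Summing 16 references in
// 16-bit lanes cannot overflow even at 12-bit depth (16 * 4095 = 65520). The
// full 16x16 DC below sums 32 references, so its last step widens to 32 bits
// with vpaddl_u16.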
static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) {
  const uint16x8x2_t ref_u16 = vld2q_u16(ref);
  const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]);
  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
  sum = vpadd_u16(sum, sum);
  return vpadd_u16(sum, sum);
}

static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x4_t dc) {
  uint16x8x2_t dc_dup;
  int i;
  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
  for (i = 0; i < 16; ++i, dst += stride) {
    vst2q_u16(dst, dc_dup);
  }
}

void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8x2_t a = vld2q_u16(above);
  const uint16x8x2_t l = vld2q_u16(left);
  const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]);
  const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]);
  const uint16x8_t pal0 = vaddq_u16(pa, pl);
  uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
  uint32x2_t sum;
  uint16x4_t dc;
  (void)bd;
  pal1 = vpadd_u16(pal1, pal1);
  sum = vpaddl_u16(pal1);
  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint16x4_t sum = dc_sum_16(left);
  const uint16x4_t dc = vrshr_n_u16(sum, 4);
  (void)above;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x4_t sum = dc_sum_16(above);
  const uint16x4_t dc = vrshr_n_u16(sum, 4);
  (void)left;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_16x16(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 32x32

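// A 32-element sum can overflow 16 bits at 12-bit depth (32 * 4095 > 65535),
// so the reduction widens with vpaddl_u16 and the DC rounding shift is done
// in 32 bits with vrshr_n_u32.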
static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) {
  const uint16x8x4_t r = vld4q_u16(ref);
  const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]);
  const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]);
  const uint16x8_t p2 = vaddq_u16(p0, p1);
  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
  sum = vpadd_u16(sum, sum);
  return vpaddl_u16(sum);
}

static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x4_t dc) {
  uint16x8x2_t dc_dup;
  int i;
  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);

  for (i = 0; i < 32; ++i) {
    vst2q_u16(dst, dc_dup);
    dst += 16;
    vst2q_u16(dst, dc_dup);
    dst += stride - 16;
  }
}

void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8x4_t a = vld4q_u16(above);
  const uint16x8x4_t l = vld4q_u16(left);
  const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]);
  const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]);
  const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]);
  const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]);
  const uint16x8_t pa = vaddq_u16(pa0, pa1);
  const uint16x8_t pl = vaddq_u16(pl0, pl1);
  const uint16x8_t pal0 = vaddq_u16(pa, pl);
  const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
  uint32x2_t sum = vpaddl_u16(pal1);
  uint16x4_t dc;
  (void)bd;
  sum = vpadd_u32(sum, sum);
  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6));
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint32x2_t sum = dc_sum_32(left);
  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
  (void)above;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint32x2_t sum = dc_sum_32(above);
  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
  (void)left;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_32x32(dst, stride, dc);
}

// -----------------------------------------------------------------------------

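// D45 prediction filters the above row with the 3-tap kernel
// (a + 2 * b + c + 2) >> 2, computed here as vrhaddq_u16(vhaddq_u16(a, c), b),
// which yields the same rounded value for all inputs. Each output row is the
// previous row shifted left by one sample, padded with the above-right pixel.
// Note the 4x4 kernel loads 8 samples starting at above + 1 and above + 2;
// this assumes the caller provides the usual extended/padded above row.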
void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8_t ABCDEFGH = vld1q_u16(above);
  const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);
  const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);
  const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);
  const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);
  const uint16x4_t avg2_low = vget_low_u16(avg2);
  const uint16x4_t avg2_high = vget_high_u16(avg2);
  const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);
  const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);
  const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);
  (void)left;
  (void)bd;
  vst1_u16(dst, avg2_low);
  dst += stride;
  vst1_u16(dst, r1);
  dst += stride;
  vst1_u16(dst, r2);
  dst += stride;
  vst1_u16(dst, r3);
  vst1q_lane_u16(dst + 3, ABCDEFGH, 7);
}

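// Shift one sample of the replicated above-right pixel into the filtered row,
// then store it and advance dst; each call emits the next output row.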
static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
                               const uint16x8_t above_right, uint16x8_t *row) {
  *row = vextq_u16(*row, above_right, 1);
  vst1q_u16(*dst, *row);
  *dst += stride;
}

void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8_t A0 = vld1q_u16(above);
  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);
  const uint16x8_t A1 = vld1q_u16(above + 1);
  const uint16x8_t A2 = vld1q_u16(above + 2);
  const uint16x8_t avg1 = vhaddq_u16(A0, A2);
  uint16x8_t row = vrhaddq_u16(avg1, A1);
  (void)left;
  (void)bd;

  vst1q_u16(dst, row);
  dst += stride;
  d45_store_8(&dst, stride, above_right, &row);
  d45_store_8(&dst, stride, above_right, &row);
  d45_store_8(&dst, stride, above_right, &row);
  d45_store_8(&dst, stride, above_right, &row);
  d45_store_8(&dst, stride, above_right, &row);
  d45_store_8(&dst, stride, above_right, &row);
  vst1q_u16(dst, above_right);
}

static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
                                const uint16x8_t above_right, uint16x8_t *row_0,
                                uint16x8_t *row_1) {
  *row_0 = vextq_u16(*row_0, *row_1, 1);
  *row_1 = vextq_u16(*row_1, above_right, 1);
  vst1q_u16(*dst, *row_0);
  *dst += 8;
  vst1q_u16(*dst, *row_1);
  *dst += stride - 8;
}

void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const uint16x8_t A0_0 = vld1q_u16(above);
  const uint16x8_t A0_1 = vld1q_u16(above + 8);
  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
  const uint16x8_t A1_0 = vld1q_u16(above + 1);
  const uint16x8_t A1_1 = vld1q_u16(above + 9);
  const uint16x8_t A2_0 = vld1q_u16(above + 2);
  const uint16x8_t A2_1 = vld1q_u16(above + 10);
  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
  (void)left;
  (void)bd;

  vst1q_u16(dst, row_0);
  vst1q_u16(dst + 8, row_1);
  dst += stride;
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  vst1q_u16(dst, above_right);
  vst1q_u16(dst + 8, above_right);
}

void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const uint16x8_t A0_0 = vld1q_u16(above);
  const uint16x8_t A0_1 = vld1q_u16(above + 8);
  const uint16x8_t A0_2 = vld1q_u16(above + 16);
  const uint16x8_t A0_3 = vld1q_u16(above + 24);
  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
  const uint16x8_t A1_0 = vld1q_u16(above + 1);
  const uint16x8_t A1_1 = vld1q_u16(above + 9);
  const uint16x8_t A1_2 = vld1q_u16(above + 17);
  const uint16x8_t A1_3 = vld1q_u16(above + 25);
  const uint16x8_t A2_0 = vld1q_u16(above + 2);
  const uint16x8_t A2_1 = vld1q_u16(above + 10);
  const uint16x8_t A2_2 = vld1q_u16(above + 18);
  const uint16x8_t A2_3 = vld1q_u16(above + 26);
  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
  const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
  const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
  uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);
  uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);
  int i;
  (void)left;
  (void)bd;

  vst1q_u16(dst, row_0);
  dst += 8;
  vst1q_u16(dst, row_1);
  dst += 8;
  vst1q_u16(dst, row_2);
  dst += 8;
  vst1q_u16(dst, row_3);
  dst += stride - 24;

  for (i = 0; i < 30; ++i) {
    row_0 = vextq_u16(row_0, row_1, 1);
    row_1 = vextq_u16(row_1, row_2, 1);
    row_2 = vextq_u16(row_2, row_3, 1);
    row_3 = vextq_u16(row_3, above_right, 1);
    vst1q_u16(dst, row_0);
    dst += 8;
    vst1q_u16(dst, row_1);
    dst += 8;
    vst1q_u16(dst, row_2);
    dst += 8;
    vst1q_u16(dst, row_3);
    dst += stride - 24;
  }

  vst1q_u16(dst, above_right);
  dst += 8;
  vst1q_u16(dst, above_right);
  dst += 8;
  vst1q_u16(dst, above_right);
  dst += 8;
  vst1q_u16(dst, above_right);
}

// -----------------------------------------------------------------------------

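// D135 prediction runs the same (a + 2 * b + c + 2) >> 2 filter along the
// 135-degree diagonal formed by the reversed left column, the top-left pixel
// and the above row. Each output row is the filtered diagonal shifted so it
// starts one sample earlier along the diagonal than the row above it.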
void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t XA0123___ = vld1q_u16(above - 1);
  const uint16x4_t L0123 = vld1_u16(left);
  const uint16x4_t L3210 = vrev64_u16(L0123);
  const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
  const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
  const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
  const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
  const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
  const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
  const uint16x4_t row_0 = vget_low_u16(avg2);
  const uint16x4_t row_1 = vget_high_u16(avg2);
  const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
  const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
  const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
  (void)bd;
  vst1_u16(dst, r0);
  dst += stride;
  vst1_u16(dst, r1);
  dst += stride;
  vst1_u16(dst, r2);
  dst += stride;
  vst1_u16(dst, row_0);
}

void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t XA0123456 = vld1q_u16(above - 1);
  const uint16x8_t A01234567 = vld1q_u16(above);
  const uint16x8_t A1234567_ = vld1q_u16(above + 1);
  const uint16x8_t L01234567 = vld1q_u16(left);
  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
  const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
  const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
  const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
  const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
  const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
  const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
  const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
  const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
  const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
  const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
  const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
  (void)bd;
  vst1q_u16(dst, r0);
  dst += stride;
  vst1q_u16(dst, r1);
  dst += stride;
  vst1q_u16(dst, r2);
  dst += stride;
  vst1q_u16(dst, r3);
  dst += stride;
  vst1q_u16(dst, r4);
  dst += stride;
  vst1q_u16(dst, r5);
  dst += stride;
  vst1q_u16(dst, r6);
  dst += stride;
  vst1q_u16(dst, row_0);
}

static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
                                 const uint16x8_t row_0,
                                 const uint16x8_t row_1) {
  vst1q_u16(*dst, row_0);
  *dst += 8;
  vst1q_u16(*dst, row_1);
  *dst += stride - 8;
}

void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x8_t L01234567 = vld1q_u16(left);
  const uint16x8_t L89abcdef = vld1q_u16(left + 8);
  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
  const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
  const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
  const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
  const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
  const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
  const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
  const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);

  const uint16x8_t XA0123456 = vld1q_u16(above - 1);
  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
  const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
  const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);

  const uint16x8_t A01234567 = vld1q_u16(above);
  const uint16x8_t A12345678 = vld1q_u16(above + 1);
  const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
  const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);

  const uint16x8_t A789abcde = vld1q_u16(above + 7);
  const uint16x8_t A89abcdef = vld1q_u16(above + 8);
  const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
  const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
  const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);

  const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
  const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
  const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
  const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
  const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
  const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
  const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
  const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
  const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
  const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
  const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
  const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
  const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
  const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
  const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
  const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
  const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
  const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
  const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
  const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
  const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
  (void)bd;

  d135_store_16(&dst, stride, r0_0, r0_1);
  d135_store_16(&dst, stride, r1_0, r1_1);
  d135_store_16(&dst, stride, r2_0, r2_1);
  d135_store_16(&dst, stride, r3_0, r3_1);
  d135_store_16(&dst, stride, r4_0, r4_1);
  d135_store_16(&dst, stride, r5_0, r5_1);
  d135_store_16(&dst, stride, r6_0, r6_1);
  d135_store_16(&dst, stride, row_1, row_2);
  d135_store_16(&dst, stride, r8_0, r0_0);
  d135_store_16(&dst, stride, r9_0, r1_0);
  d135_store_16(&dst, stride, ra_0, r2_0);
  d135_store_16(&dst, stride, rb_0, r3_0);
  d135_store_16(&dst, stride, rc_0, r4_0);
  d135_store_16(&dst, stride, rd_0, r5_0);
  d135_store_16(&dst, stride, re_0, r6_0);
  vst1q_u16(dst, row_0);
  dst += 8;
  vst1q_u16(dst, row_1);
}

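// The 32x32 version keeps the filtered diagonal in eight registers (row_0
// holds the deepest left samples, row_7 the right end of the above row) and
// writes the block bottom-up: dst starts at the last row, and after each row
// the four visible registers shift left by one sample.
// vextq_u16(row_4, row_4, 1) rotates row_4 so its next sample reaches lane 0
// for the following shift into row_3; after eight rows the register has been
// consumed and is replaced (row_4 = row_5, and so on).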
void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x8_t LL01234567 = vld1q_u16(left + 16);
  const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
  const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
  const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
  const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
  const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
  const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
  const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
  const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
  const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
  const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
  uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);

  const uint16x8_t LU01234567 = vld1q_u16(left);
  const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
  const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
  const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
  const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
  const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
  const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
  const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
  const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
  const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
  const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
  uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);

  const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
  const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
  const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
  uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);

  const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
  const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
  const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
  const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
  uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);

  const uint16x8_t AL01234567 = vld1q_u16(above);
  const uint16x8_t AL12345678 = vld1q_u16(above + 1);
  const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
  uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);

  const uint16x8_t AL789abcde = vld1q_u16(above + 7);
  const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
  const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
  const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
  uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);

  const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
  const uint16x8_t AR01234567 = vld1q_u16(above + 16);
  const uint16x8_t AR12345678 = vld1q_u16(above + 17);
  const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
  uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);

  const uint16x8_t AR789abcde = vld1q_u16(above + 23);
  const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
  const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
  const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
  uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
  int i, j;
  (void)bd;

  dst += 31 * stride;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 8; ++j) {
      vst1q_u16(dst, row_0);
      dst += 8;
      vst1q_u16(dst, row_1);
      dst += 8;
      vst1q_u16(dst, row_2);
      dst += 8;
      vst1q_u16(dst, row_3);
      dst -= stride + 24;
      row_0 = vextq_u16(row_0, row_1, 1);
      row_1 = vextq_u16(row_1, row_2, 1);
      row_2 = vextq_u16(row_2, row_3, 1);
      row_3 = vextq_u16(row_3, row_4, 1);
      row_4 = vextq_u16(row_4, row_4, 1);
    }
    row_4 = row_5;
    row_5 = row_6;
    row_6 = row_7;
  }
}

//------------------------------------------------------------------------------

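// V (vertical) prediction copies the above row into every row of the block.
// The 16x16 and 32x32 variants use vld2q_u16/vst2q_u16 only as wide
// contiguous loads and stores; storing with the same structure that was
// loaded undoes the de-interleave, so pixel order is preserved.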
void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const uint16x4_t row = vld1_u16(above);
  int i;
  (void)left;
  (void)bd;

  for (i = 0; i < 4; i++, dst += stride) {
    vst1_u16(dst, row);
  }
}

void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const uint16x8_t row = vld1q_u16(above);
  int i;
  (void)left;
  (void)bd;

  for (i = 0; i < 8; i++, dst += stride) {
    vst1q_u16(dst, row);
  }
}

void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8x2_t row = vld2q_u16(above);
  int i;
  (void)left;
  (void)bd;

  for (i = 0; i < 16; i++, dst += stride) {
    vst2q_u16(dst, row);
  }
}

void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8x2_t row0 = vld2q_u16(above);
  const uint16x8x2_t row1 = vld2q_u16(above + 16);
  int i;
  (void)left;
  (void)bd;

  for (i = 0; i < 32; i++) {
    vst2q_u16(dst, row0);
    dst += 16;
    vst2q_u16(dst, row1);
    dst += stride - 16;
  }
}

// -----------------------------------------------------------------------------

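// H (horizontal) prediction fills each output row with that row's left-column
// pixel, broadcast across the row with vdup_lane_u16/vdupq_lane_u16.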
void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const uint16x4_t left_u16 = vld1_u16(left);
  uint16x4_t row;
  (void)above;
  (void)bd;

  row = vdup_lane_u16(left_u16, 0);
  vst1_u16(dst, row);
  dst += stride;
  row = vdup_lane_u16(left_u16, 1);
  vst1_u16(dst, row);
  dst += stride;
  row = vdup_lane_u16(left_u16, 2);
  vst1_u16(dst, row);
  dst += stride;
  row = vdup_lane_u16(left_u16, 3);
  vst1_u16(dst, row);
}

void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const uint16x8_t left_u16 = vld1q_u16(left);
  const uint16x4_t left_low = vget_low_u16(left_u16);
  const uint16x4_t left_high = vget_high_u16(left_u16);
  uint16x8_t row;
  (void)above;
  (void)bd;

  row = vdupq_lane_u16(left_low, 0);
  vst1q_u16(dst, row);
  dst += stride;
  row = vdupq_lane_u16(left_low, 1);
  vst1q_u16(dst, row);
  dst += stride;
  row = vdupq_lane_u16(left_low, 2);
  vst1q_u16(dst, row);
  dst += stride;
  row = vdupq_lane_u16(left_low, 3);
  vst1q_u16(dst, row);
  dst += stride;
  row = vdupq_lane_u16(left_high, 0);
  vst1q_u16(dst, row);
  dst += stride;
  row = vdupq_lane_u16(left_high, 1);
  vst1q_u16(dst, row);
  dst += stride;
  row = vdupq_lane_u16(left_high, 2);
  vst1q_u16(dst, row);
  dst += stride;
  row = vdupq_lane_u16(left_high, 3);
  vst1q_u16(dst, row);
}

static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
                              const uint16x8_t row) {
  // Note: vst1q is faster than vst2q
  vst1q_u16(*dst, row);
  *dst += 8;
  vst1q_u16(*dst, row);
  *dst += stride - 8;
}

void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i;
  (void)above;
  (void)bd;

  for (i = 0; i < 2; i++, left += 8) {
    const uint16x8_t left_u16q = vld1q_u16(left);
    const uint16x4_t left_low = vget_low_u16(left_u16q);
    const uint16x4_t left_high = vget_high_u16(left_u16q);
    uint16x8_t row;

    row = vdupq_lane_u16(left_low, 0);
    h_store_16(&dst, stride, row);
    row = vdupq_lane_u16(left_low, 1);
    h_store_16(&dst, stride, row);
    row = vdupq_lane_u16(left_low, 2);
    h_store_16(&dst, stride, row);
    row = vdupq_lane_u16(left_low, 3);
    h_store_16(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 0);
    h_store_16(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 1);
    h_store_16(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 2);
    h_store_16(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 3);
    h_store_16(&dst, stride, row);
  }
}

static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
                              const uint16x8_t row) {
  // Note: vst1q is faster than vst2q
  vst1q_u16(*dst, row);
  *dst += 8;
  vst1q_u16(*dst, row);
  *dst += 8;
  vst1q_u16(*dst, row);
  *dst += 8;
  vst1q_u16(*dst, row);
  *dst += stride - 24;
}

void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i;
  (void)above;
  (void)bd;

  for (i = 0; i < 4; i++, left += 8) {
    const uint16x8_t left_u16q = vld1q_u16(left);
    const uint16x4_t left_low = vget_low_u16(left_u16q);
    const uint16x4_t left_high = vget_high_u16(left_u16q);
    uint16x8_t row;

    row = vdupq_lane_u16(left_low, 0);
    h_store_32(&dst, stride, row);
    row = vdupq_lane_u16(left_low, 1);
    h_store_32(&dst, stride, row);
    row = vdupq_lane_u16(left_low, 2);
    h_store_32(&dst, stride, row);
    row = vdupq_lane_u16(left_low, 3);
    h_store_32(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 0);
    h_store_32(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 1);
    h_store_32(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 2);
    h_store_32(&dst, stride, row);
    row = vdupq_lane_u16(left_high, 3);
    h_store_32(&dst, stride, row);
  }
}

// -----------------------------------------------------------------------------

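// TM (TrueMotion) prediction: pred(r, c) = clamp(left[r] + above[c] -
// top_left, 0, (1 << bd) - 1), where top_left is above[-1]. The kernels below
// precompute sub[c] = above[c] - top_left once, then for each row broadcast
// left[r], add, and clamp: vminq_s16 caps the result at (1 << bd) - 1 and
// vqshluq_n_s16(sum, 0) saturates negative sums to zero while converting to
// unsigned.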
void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
  const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
  const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
  const int16x8_t sub = vsubq_s16(above_s16, top_left);
  int16x8_t sum;
  uint16x8_t row;

  sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
  sum = vaddq_s16(sum, sub);
  sum = vminq_s16(sum, max);
  row = vqshluq_n_s16(sum, 0);
  vst1_u16(dst, vget_low_u16(row));
  dst += stride;
  vst1_u16(dst, vget_high_u16(row));
  dst += stride;

  sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
  sum = vaddq_s16(sum, sub);
  sum = vminq_s16(sum, max);
  row = vqshluq_n_s16(sum, 0);
  vst1_u16(dst, vget_low_u16(row));
  dst += stride;
  vst1_u16(dst, vget_high_u16(row));
}

static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
                               const int16x8_t left_dup, const int16x8_t sub,
                               const int16x8_t max) {
  uint16x8_t row;
  int16x8_t sum = vaddq_s16(left_dup, sub);
  sum = vminq_s16(sum, max);
  row = vqshluq_n_s16(sum, 0);
  vst1q_u16(*dst, row);
  *dst += stride;
}

void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
  const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
  const int16x8_t sub = vsubq_s16(above_s16, top_left);
  int16x4_t left_s16d;
  int16x8_t left_dup;
  int i;

  left_s16d = vget_low_s16(left_s16);

  for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
    left_dup = vdupq_lane_s16(left_s16d, 0);
    tm_8_kernel(&dst, stride, left_dup, sub, max);

    left_dup = vdupq_lane_s16(left_s16d, 1);
    tm_8_kernel(&dst, stride, left_dup, sub, max);

    left_dup = vdupq_lane_s16(left_s16d, 2);
    tm_8_kernel(&dst, stride, left_dup, sub, max);

    left_dup = vdupq_lane_s16(left_s16d, 3);
    tm_8_kernel(&dst, stride, left_dup, sub, max);
  }
}

static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1, const int16x8_t max) {
  uint16x8_t row0, row1;
  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
  sum0 = vminq_s16(sum0, max);
  sum1 = vminq_s16(sum1, max);
  row0 = vqshluq_n_s16(sum0, 0);
  row1 = vqshluq_n_s16(sum1, 0);
  vst1q_u16(*dst, row0);
  *dst += 8;
  vst1q_u16(*dst, row1);
  *dst += stride - 8;
}

void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
  const int16x8_t sub0 = vsubq_s16(above0, top_left);
  const int16x8_t sub1 = vsubq_s16(above1, top_left);
  int16x8_t left_dup;
  int i, j;

  for (j = 0; j < 2; j++, left += 8) {
    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
    int16x4_t left_s16d = vget_low_s16(left_s16q);
    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
      left_dup = vdupq_lane_s16(left_s16d, 0);
      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);

      left_dup = vdupq_lane_s16(left_s16d, 1);
      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);

      left_dup = vdupq_lane_s16(left_s16d, 2);
      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);

      left_dup = vdupq_lane_s16(left_s16d, 3);
      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
    }
  }
}

static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1, const int16x8_t sub2,
                                const int16x8_t sub3, const int16x8_t max) {
  uint16x8_t row0, row1, row2, row3;
  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
  int16x8_t sum2 = vaddq_s16(left_dup, sub2);
  int16x8_t sum3 = vaddq_s16(left_dup, sub3);
  sum0 = vminq_s16(sum0, max);
  sum1 = vminq_s16(sum1, max);
  sum2 = vminq_s16(sum2, max);
  sum3 = vminq_s16(sum3, max);
  row0 = vqshluq_n_s16(sum0, 0);
  row1 = vqshluq_n_s16(sum1, 0);
  row2 = vqshluq_n_s16(sum2, 0);
  row3 = vqshluq_n_s16(sum3, 0);
  vst1q_u16(*dst, row0);
  *dst += 8;
  vst1q_u16(*dst, row1);
  *dst += 8;
  vst1q_u16(*dst, row2);
  *dst += 8;
  vst1q_u16(*dst, row3);
  *dst += stride - 24;
}

void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
  const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
  const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
  const int16x8_t sub0 = vsubq_s16(above0, top_left);
  const int16x8_t sub1 = vsubq_s16(above1, top_left);
  const int16x8_t sub2 = vsubq_s16(above2, top_left);
  const int16x8_t sub3 = vsubq_s16(above3, top_left);
  int16x8_t left_dup;
  int i, j;

  for (i = 0; i < 4; i++, left += 8) {
    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
    int16x4_t left_s16d = vget_low_s16(left_s16q);
    for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
      left_dup = vdupq_lane_s16(left_s16d, 0);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);

      left_dup = vdupq_lane_s16(left_s16d, 1);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);

      left_dup = vdupq_lane_s16(left_s16d, 2);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);

      left_dup = vdupq_lane_s16(left_s16d, 3);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
    }
  }
}
