/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"

// -----------------------------------------------------------------------------
// Copy and average

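// Each branch below handles one block width (4, 8, 16, 32 or 64 pixels, per
// the branch comments): the two widest sizes copy one row per iteration, the
// narrower ones copy two rows per iteration.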
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int width, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;

      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;

      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;

      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}

void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int width, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));

      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));

      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));

      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}

// -----------------------------------------------------------------------------
// Horizontal and vertical filtering

#define CONV8_ROUNDING_BITS (7)

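// The byte-shuffle patterns below gather overlapping pairs of 16-bit pixels
// so that one _mm256_madd_epi16 against a broadcast coefficient pair
// accumulates two filter taps at once. Within each 128-bit lane:
//   signal_pattern_0 -> word pairs (0,1) (1,2) (2,3) (3,4)
//   signal_pattern_1 -> word pairs (2,3) (3,4) (4,5) (5,6)
//   signal_pattern_2 -> word pairs (3,4) (4,5) (5,6) (6,7)
// signal_index drives _mm256_permutevar8x32_epi32 to place source words
// 4..11 in both lanes first, so the same patterns also reach the upper taps.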
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };

// -----------------------------------------------------------------------------
// Horizontal Filtering

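// Expand the 16 consecutive pixels in *s into four registers of adjacent
// pixel pairs, ready for _mm256_madd_epi16 against the broadcast coefficient
// pairs produced by pack_filters() below.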
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}

// Note:
//  Shared by 8x2 and 16x1 block
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}

static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}

static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&s0, &s1, x);
}

static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&s0, &s1, x);
}

// Note:
//  Shared by horizontal and vertical filtering
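//  f[k] holds filter taps (2k, 2k + 1) broadcast to every 32-bit lane,
//  matching the pixel pairs produced by pack_pixels() above.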
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}

static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

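  // Summing the middle products through min/max is an exact reformulation of
  // a0 + a1 (min + max == a0 + a1); the ordering appears intended to keep the
  // intermediate 32-bit sums away from overflow with extreme filter inputs.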
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}

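// The store helpers pack the 32-bit filter output to 16 bits with unsigned
// saturation, then clamp against *mask ((1 << bd) - 1) so results never
// exceed the source bit depth.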
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y);
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap horizontal filtering

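// The 2-tap (bilinear) kernels keep their nonzero coefficients in taps 3 and
// 4 of the 8-tap array; 0x09080706 selects that pair of 16-bit coefficients
// (bytes 6..9) and broadcasts it to every 32-bit lane.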
static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}

// Shared by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(): s0/s1 are either
// the first and second rows of an 8-wide block, or the first 16 samples of a
// row and the same row shifted by 8 samples.
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}

static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}

// Shared by the 8x2 and 16x1 2-tap horizontal paths and, via
// filter_16x2_2t_pixels(), the 16-wide 2-tap vertical path.
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
                                       __m256i *y0, __m256i *y1) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  x1 = _mm256_add_epi32(x1, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
                                        __m256i *y0) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
}

static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// Vertical Filtering

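// The 8-tap vertical filter keeps a sliding window of interleaved row pairs:
// pack_8x9_init() loads rows 0..6, then each loop iteration appends rows 7
// and 8 with pack_8x9_pixels(), filters two output rows, and shifts the
// window down two rows with update_pixels().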
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}

static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}

static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig, f, y0);
  filter_8x1_pixels(&sig[4], f, y1);
}

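// Shift the interleaved row pairs down by one pair (two source rows):
// sig[1..3] -> sig[0..2] and sig[5..7] -> sig[4..6].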
static INLINE void update_pixels(__m256i *sig) {
  int i;
  for (i = 0; i < 3; ++i) {
    sig[i] = sig[i + 1];
    sig[i + 4] = sig[i + 5];
  }
}

static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

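// 16-wide vertical filtering: a full row spans both 128-bit lanes, so
// permute2x128 first splits each pair of rows into their lower- and
// upper-half pixels; sig[0..7] carry the lower 8 pixels of the window,
// sig[8..15] the upper 8, and sig[16] caches the most recent raw row.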
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load rows 0 - 6
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}

static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}

static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}

static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  __m256i p = _mm256_min_epi16(*y0, *mask);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static void update_16x9_pixels(__m256i *sig) {
  update_pixels(&sig[0]);
  update_pixels(&sig[8]);
}

static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap vertical filtering

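// The 2-tap vertical path only needs the current row and the next one;
// sig[2] carries the most recently loaded row across iterations.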
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}

static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // load the next row
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}

static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m128i p = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(h, p);
}

static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}

static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // load the next row
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
                                      __m128i *y0, __m128i *y1) {
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
  x0 = _mm_add_epi32(x0, rounding);
  x1 = _mm_add_epi32(x1, rounding);
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  res = _mm_min_epi16(res, *mask);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// Calculation with averaging the input pixels: these store helpers blend the
// clamped filter output with the pixels already in dst via the rounding
// average instructions, for the avg_ kernel variants below.

static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
                                        uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y0);
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
  const __m256i pix =
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
  __m256i p = _mm256_min_epi16(*y0, *mask);
  p = _mm256_avg_epu16(p, pix0);
  _mm256_storeu_si256((__m256i *)dst, p);

  p = _mm256_min_epi16(*y1, *mask);
  p = _mm256_avg_epu16(p, pix1);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
                                               const __m128i *y1,
                                               const __m128i *mask,
                                               uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, *mask);
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

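// There is no 4-wide AVX2 path: declare the SSE2 kernels and alias them to
// the _avx2 names that the HIGH_FUN_CONV macros below expect.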
void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

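// These macros (from vpx_dsp/x86/convolve.h) assemble the 1-D kernels above
// into the vpx_highbd_convolve8_* entry points dispatched via
// vpx_dsp_rtcd.h.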
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
HIGH_FUN_CONV_2D(, avx2);

void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
                 avx2);
HIGH_FUN_CONV_2D(avg_, avx2);