1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12
13#include "./vpx_config.h"
14#include "./vp9_rtcd.h"
15#include "vpx_ports/mem.h"
16
// Signature shared by every 8-bit 1-D filter kernel declared in this file
// (the vp9_filter_block1d* symbols).
//
//   src_ptr        first source pixel handed to the kernel
//   src_pitch      source stride (the wrappers pass src_stride)
//   output_ptr     destination pixels
//   out_pitch      destination stride (the wrappers pass dst_stride)
//   output_height  row count (the wrappers pass h)
//   filter         coefficient array (the wrappers pass filter_x or filter_y)
//
// Fixed-width types are used so the prototype agrees with what the wrapper
// macros actually pass (uint8_t pixels, int16_t coefficients) and with
// high_filter8_1dfunction.
typedef void filter8_1dfunction(
  const uint8_t *src_ptr,
  ptrdiff_t src_pitch,
  uint8_t *output_ptr,
  ptrdiff_t out_pitch,
  unsigned int output_height,
  const int16_t *filter
);
25
// FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
//
// Defines vp9_convolve8_<name>_<opt>(), a 1-D convolution wrapper that walks
// the block left to right in 16-, 8- and then 4-column strips and hands each
// strip to vp9_filter_block1d<N>_<dir><8|2>_<avg><opt>().
//
//   name       suffix of the generated function and of its _c fallback
//   step_q4    step argument to test (x_step_q4 or y_step_q4)
//   filter     coefficient array to use (filter_x or filter_y)
//   dir        direction letter pasted into the kernel name
//   src_start  expression for the first source pixel given to the "8"
//              kernels; it is re-evaluated per strip, so it tracks src
//   avg        empty or avg_, pasted into the kernel name
//   opt        instruction-set suffix pasted into the names
//
// Strips are only used when the step is exactly 16 and filter[3] != 128.
// In that case the "8" kernels run if any of filter[0..2] is non-zero;
// otherwise the "2" kernels run on src directly.  Whatever width is left
// (all of it when the strip path is skipped) is passed to
// vp9_convolve8_<name>_c() starting at the advanced src/dst.
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                    uint8_t *dst, ptrdiff_t dst_stride, \
                                    const int16_t *filter_x, int x_step_q4, \
                                    const int16_t *filter_y, int y_step_q4, \
                                    int w, int h) { \
    if (step_q4 == 16 && filter[3] != 128) { \
      if (filter[0] || filter[1] || filter[2]) { \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
                                                   dst, dst_stride, \
                                                   h, filter); \
        } \
        for (; w >= 8; src += 8, dst += 8, w -= 8) { \
          vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
        for (; w >= 4; src += 4, dst += 4, w -= 4) { \
          vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
      } else { \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \
                                                   dst, dst_stride, \
                                                   h, filter); \
        } \
        for (; w >= 8; src += 8, dst += 8, w -= 8) { \
          vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
        for (; w >= 4; src += 4, dst += 4, w -= 4) { \
          vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
      } \
    } \
    if (w != 0) { \
      vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h); \
    } \
  }
109
// FUN_CONV_2D(avg, opt)
//
// Defines vp9_convolve8_<avg><opt>(), a 2-D convolution built from the 1-D
// wrappers: vp9_convolve8_horiz_<opt>() writes into a temporary with a row
// stride of 64, then vp9_convolve8_<avg>vert_<opt>() filters that temporary
// into dst.  w and h are asserted to be at most 64.
//
// Unless both steps are exactly 16 the call is forwarded unchanged to
// vp9_convolve8_<avg>c().
//
// If either filter has a non-zero coefficient in taps 0..2, or tap 3 equal to
// 128, the horizontal pass starts 3 rows above src and produces h + 7 rows,
// and the vertical pass starts at row 3 of the temporary.  Otherwise the
// horizontal pass starts at src and produces h + 1 rows.
#define FUN_CONV_2D(avg, opt) \
  void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                uint8_t *dst, ptrdiff_t dst_stride, \
                                const int16_t *filter_x, int x_step_q4, \
                                const int16_t *filter_y, int y_step_q4, \
                                int w, int h) { \
    assert(w <= 64); \
    assert(h <= 64); \
    if (x_step_q4 != 16 || y_step_q4 != 16) { \
      vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
                             filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
      return; \
    } \
    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                fdata2, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 7); \
      vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, \
                                      dst, dst_stride, \
                                      filter_x, x_step_q4, filter_y, \
                                      y_step_q4, w, h); \
    } else { \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
      vp9_convolve8_horiz_##opt(src, src_stride, \
                                fdata2, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 1); \
      vp9_convolve8_##avg##vert_##opt(fdata2, 64, \
                                      dst, dst_stride, \
                                      filter_x, x_step_q4, filter_y, \
                                      y_step_q4, w, h); \
    } \
  }
142
143#if CONFIG_VP9_HIGHBITDEPTH
144
// Signature shared by the vp9_high_filter_block1d* kernels declared in this
// file.  Same layout as filter8_1dfunction but with uint16_t pixels and a
// trailing bd argument (the wrappers forward their own bd).
typedef void high_filter8_1dfunction(const uint16_t *src,
                                     ptrdiff_t src_stride,
                                     uint16_t *dst,
                                     ptrdiff_t dst_stride,
                                     unsigned int height,
                                     const int16_t *filter,
                                     int bd);
154
// HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
//
// Defines vp9_high_convolve8_<name>_<opt>(), the uint16_t-pixel counterpart
// of the wrapper generated by FUN_CONV_1D.  src8/dst8 are converted with
// CONVERT_TO_SHORTPTR into the local src/dst that the strip kernels
// vp9_high_filter_block1d<N>_<dir><8|2>_<avg><opt>() operate on; bd is
// forwarded to every kernel and to the fallback.
//
//   name       suffix of the generated function and of its _c fallback
//   step_q4    step argument to test (x_step_q4 or y_step_q4)
//   filter     coefficient array to use (filter_x or filter_y)
//   dir        direction letter pasted into the kernel name
//   src_start  expression (in terms of the local uint16_t src) for the first
//              source pixel given to the "8" kernels; re-evaluated per strip
//   avg        empty or avg_, pasted into the kernel name
//   opt        instruction-set suffix pasted into the names
//
// Strips of 16, 8 and then 4 columns are used only when the step is exactly
// 16 and filter[3] != 128: the "8" kernels if any of filter[0..2] is
// non-zero, the "2" kernels otherwise.  Remaining width goes to
// vp9_high_convolve8_<name>_c().
//
// The advanced src/dst are written back to src8/dst8 before that fallback.
// Without this, a leftover width after one or more strips would make the C
// code reprocess the first columns instead of the remaining ones.
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
  void vp9_high_convolve8_##name##_##opt(const uint8_t *src8, \
                                         ptrdiff_t src_stride, \
                                         uint8_t *dst8, ptrdiff_t dst_stride, \
                                         const int16_t *filter_x, \
                                         int x_step_q4, \
                                         const int16_t *filter_y, \
                                         int y_step_q4, \
                                         int w, int h, int bd) { \
  if (step_q4 == 16 && filter[3] != 128) { \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
    if (filter[0] || filter[1] || filter[2]) { \
      while (w >= 16) { \
        vp9_high_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                      src_stride, \
                                                      dst, \
                                                      dst_stride, \
                                                      h, \
                                                      filter, \
                                                      bd); \
        src += 16; \
        dst += 16; \
        w -= 16; \
      } \
      while (w >= 8) { \
        vp9_high_filter_block1d8_##dir##8_##avg##opt(src_start, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 8; \
        dst += 8; \
        w -= 8; \
      } \
      while (w >= 4) { \
        vp9_high_filter_block1d4_##dir##8_##avg##opt(src_start, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 4; \
        dst += 4; \
        w -= 4; \
      } \
    } else { \
      while (w >= 16) { \
        vp9_high_filter_block1d16_##dir##2_##avg##opt(src, \
                                                      src_stride, \
                                                      dst, \
                                                      dst_stride, \
                                                      h, \
                                                      filter, \
                                                      bd); \
        src += 16; \
        dst += 16; \
        w -= 16; \
      } \
      while (w >= 8) { \
        vp9_high_filter_block1d8_##dir##2_##avg##opt(src, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 8; \
        dst += 8; \
        w -= 8; \
      } \
      while (w >= 4) { \
        vp9_high_filter_block1d4_##dir##2_##avg##opt(src, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 4; \
        dst += 4; \
        w -= 4; \
      } \
    } \
    /* Resume the C fallback at the first column the strips did not cover. */ \
    src8 = CONVERT_TO_BYTEPTR(src); \
    dst8 = CONVERT_TO_BYTEPTR(dst); \
  } \
  if (w) { \
    vp9_high_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
                                  filter_x, x_step_q4, filter_y, y_step_q4, \
                                  w, h, bd); \
  } \
}
249
// HIGH_FUN_CONV_2D(avg, opt)
//
// Defines vp9_high_convolve8_<avg><opt>(), the uint16_t-pixel counterpart of
// the function generated by FUN_CONV_2D.  The temporary is a uint16_t array
// with a row stride of 64 and is passed to the 1-D wrappers through
// CONVERT_TO_BYTEPTR; bd is forwarded to every call.  w and h are asserted
// to be at most 64.
//
// Unless both steps are exactly 16 the call is forwarded unchanged to
// vp9_high_convolve8_<avg>c().
//
// If either filter has a non-zero coefficient in taps 0..2, or tap 3 equal to
// 128, the horizontal pass starts 3 rows above src and produces h + 7 rows,
// and the vertical pass starts 3 * 64 elements into the temporary.  Otherwise
// the horizontal pass starts at src and produces h + 1 rows.
#define HIGH_FUN_CONV_2D(avg, opt) \
  void vp9_high_convolve8_##avg##opt(const uint8_t *src, \
                                     ptrdiff_t src_stride, \
                                     uint8_t *dst, ptrdiff_t dst_stride, \
                                     const int16_t *filter_x, int x_step_q4, \
                                     const int16_t *filter_y, int y_step_q4, \
                                     int w, int h, int bd) { \
    assert(w <= 64); \
    assert(h <= 64); \
    if (x_step_q4 != 16 || y_step_q4 != 16) { \
      vp9_high_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
                                  filter_x, x_step_q4, filter_y, y_step_q4, \
                                  w, h, bd); \
      return; \
    } \
    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
      DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 71); \
      vp9_high_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                     CONVERT_TO_BYTEPTR(fdata2), 64, \
                                     filter_x, x_step_q4, \
                                     filter_y, y_step_q4, \
                                     w, h + 7, bd); \
      vp9_high_convolve8_##avg##vert_##opt( \
          CONVERT_TO_BYTEPTR(fdata2) + 3 * 64, 64, \
          dst, dst_stride, \
          filter_x, x_step_q4, filter_y, y_step_q4, \
          w, h, bd); \
    } else { \
      DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 65); \
      vp9_high_convolve8_horiz_##opt(src, src_stride, \
                                     CONVERT_TO_BYTEPTR(fdata2), 64, \
                                     filter_x, x_step_q4, \
                                     filter_y, y_step_q4, \
                                     w, h + 1, bd); \
      vp9_high_convolve8_##avg##vert_##opt( \
          CONVERT_TO_BYTEPTR(fdata2), 64, \
          dst, dst_stride, \
          filter_x, x_step_q4, filter_y, y_step_q4, \
          w, h, bd); \
    } \
  }
287#endif  // CONFIG_VP9_HIGHBITDEPTH
288
#if HAVE_AVX2 && HAVE_SSSE3
// Only the 16-wide h8/v8 kernels are declared under an _avx2 symbol.  Every
// other vp9_filter_block1d*_avx2 name that FUN_CONV_1D pastes together is
// #defined below to an SSSE3 symbol.
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
filter8_1dfunction vp9_filter_block1d16_v8_avx2;

// 8-wide h8/v8 and 4-wide h8: x86-64 builds alias the *_intrin_ssse3
// symbols, other builds alias the plain *_ssse3 ones.
#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64 / ARCH_X86

// 4-wide v8: the same SSSE3 symbol on both architectures.
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3

// All h2/v2 kernels are aliased to their SSSE3 symbols.
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3

// The two instantiations below define:
//   void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                                 uint8_t *dst, ptrdiff_t dst_stride,
//                                 const int16_t *filter_x, int x_step_q4,
//                                 const int16_t *filter_y, int y_step_q4,
//                                 int w, int h);
//   void vp9_convolve8_vert_avx2(... same parameter list ...);
// The vertical wrapper hands its "8" kernels a source pointer 3 rows above
// the current position.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

// Defines, with the same parameter list as above:
//   void vp9_convolve8_avx2(...);
FUN_CONV_2D(, avx2);
#endif  // HAVE_AVX2 && HAVE_SSSE3
#if HAVE_SSSE3
// h8/v8 kernels.  The 4-wide vertical kernel has the same symbol on both
// architectures.  For the others, x86-64 builds declare *_intrin_ssse3
// symbols and alias them to the *_ssse3 names that FUN_CONV_1D pastes
// together, while other builds declare the *_ssse3 symbols directly.
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#endif  // ARCH_X86_64 / ARCH_X86

// h8/v8 kernels, _avg variants.
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;

// h2/v2 kernels.
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

// h2/v2 kernels, _avg variants.
filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;

// The four instantiations below define:
//   void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                  uint8_t *dst, ptrdiff_t dst_stride,
//                                  const int16_t *filter_x, int x_step_q4,
//                                  const int16_t *filter_y, int y_step_q4,
//                                  int w, int h);
//   void vp9_convolve8_vert_ssse3(... same parameter list ...);
//   void vp9_convolve8_avg_horiz_ssse3(... same parameter list ...);
//   void vp9_convolve8_avg_vert_ssse3(... same parameter list ...);
// The vertical wrappers hand their "8" kernels a source pointer 3 rows above
// the current position.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);

// Define, with the same parameter list as above:
//   void vp9_convolve8_ssse3(...);
//   void vp9_convolve8_avg_ssse3(...);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
#endif  // HAVE_SSSE3
421
422#if HAVE_SSE2
423filter8_1dfunction vp9_filter_block1d16_v8_sse2;
424filter8_1dfunction vp9_filter_block1d16_h8_sse2;
425filter8_1dfunction vp9_filter_block1d8_v8_sse2;
426filter8_1dfunction vp9_filter_block1d8_h8_sse2;
427filter8_1dfunction vp9_filter_block1d4_v8_sse2;
428filter8_1dfunction vp9_filter_block1d4_h8_sse2;
429filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
430filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
431filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
432filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
433filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
434filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
435
436filter8_1dfunction vp9_filter_block1d16_v2_sse2;
437filter8_1dfunction vp9_filter_block1d16_h2_sse2;
438filter8_1dfunction vp9_filter_block1d8_v2_sse2;
439filter8_1dfunction vp9_filter_block1d8_h2_sse2;
440filter8_1dfunction vp9_filter_block1d4_v2_sse2;
441filter8_1dfunction vp9_filter_block1d4_h2_sse2;
442filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
443filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
444filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
445filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
446filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
447filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
448
449// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
450//                               uint8_t *dst, ptrdiff_t dst_stride,
451//                               const int16_t *filter_x, int x_step_q4,
452//                               const int16_t *filter_y, int y_step_q4,
453//                               int w, int h);
454// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
455//                              uint8_t *dst, ptrdiff_t dst_stride,
456//                              const int16_t *filter_x, int x_step_q4,
457//                              const int16_t *filter_y, int y_step_q4,
458//                              int w, int h);
459// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
460//                                   uint8_t *dst, ptrdiff_t dst_stride,
461//                                   const int16_t *filter_x, int x_step_q4,
462//                                   const int16_t *filter_y, int y_step_q4,
463//                                   int w, int h);
464// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
465//                                  uint8_t *dst, ptrdiff_t dst_stride,
466//                                  const int16_t *filter_x, int x_step_q4,
467//                                  const int16_t *filter_y, int y_step_q4,
468//                                  int w, int h);
469FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
470FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
471FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
472FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
473
474// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
475//                         uint8_t *dst, ptrdiff_t dst_stride,
476//                         const int16_t *filter_x, int x_step_q4,
477//                         const int16_t *filter_y, int y_step_q4,
478//                         int w, int h);
479// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
480//                             uint8_t *dst, ptrdiff_t dst_stride,
481//                             const int16_t *filter_x, int x_step_q4,
482//                             const int16_t *filter_y, int y_step_q4,
483//                             int w, int h);
484FUN_CONV_2D(, sse2);
485FUN_CONV_2D(avg_ , sse2);
486
#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
// SSE2 kernels for the uint16_t-pixel wrappers instantiated below.
// h8/v8 kernels.
high_filter8_1dfunction vp9_high_filter_block1d16_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v8_sse2;

// h8/v8 kernels, _avg variants.
high_filter8_1dfunction vp9_high_filter_block1d16_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v8_avg_sse2;

// h2/v2 kernels.
high_filter8_1dfunction vp9_high_filter_block1d16_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v2_sse2;

// h2/v2 kernels, _avg variants.
high_filter8_1dfunction vp9_high_filter_block1d16_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v2_avg_sse2;

// The four instantiations below define:
//   void vp9_high_convolve8_horiz_sse2(const uint8_t *src,
//                                      ptrdiff_t src_stride,
//                                      uint8_t *dst, ptrdiff_t dst_stride,
//                                      const int16_t *filter_x,
//                                      int x_step_q4,
//                                      const int16_t *filter_y,
//                                      int y_step_q4,
//                                      int w, int h, int bd);
//   void vp9_high_convolve8_vert_sse2(... same parameter list ...);
//   void vp9_high_convolve8_avg_horiz_sse2(... same parameter list ...);
//   void vp9_high_convolve8_avg_vert_sse2(... same parameter list ...);
// The vertical wrappers hand their "8" kernels a source pointer 3 rows above
// the current position.
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
                 sse2);

// Define, with the same parameter list as above:
//   void vp9_high_convolve8_sse2(...);
//   void vp9_high_convolve8_avg_sse2(...);
HIGH_FUN_CONV_2D(, sse2);
HIGH_FUN_CONV_2D(avg_, sse2);
#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
557#endif  // HAVE_SSE2
558