1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12
13#include "./vpx_config.h"
14#include "./vp9_rtcd.h"
15#include "vpx_ports/mem.h"
16
/*
 * Function type shared by every vp9_filter_block1d* kernel declared in this
 * file.  Declaring a kernel is then just "filter8_1dfunction <symbol>;".
 *
 *   src_ptr / src_pitch      source pointer and its pitch
 *   output_ptr / out_pitch   destination pointer and its pitch
 *   output_height            row count handed to the kernel
 *   filter                   pointer to the filter taps
 */
typedef void filter8_1dfunction(const unsigned char *src_ptr,
                                const ptrdiff_t src_pitch,
                                unsigned char *output_ptr,
                                ptrdiff_t out_pitch,
                                unsigned int output_height,
                                const short *filter);
25
/*
 * FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
 *
 * Expands to the definition of vp9_convolve8_<name>_<opt>(), which forwards
 * the block to the width-specialised kernels
 * vp9_filter_block1d{16,8,4}_<dir>{8,2}_<avg><opt>.
 *
 *   name       suffix of the generated function and of its C fallback,
 *              vp9_convolve8_<name>_c
 *   step_q4    the step argument to test (x_step_q4 or y_step_q4)
 *   filter     the tap array to test and pass on (filter_x or filter_y)
 *   dir        direction tag pasted into the kernel name (h or v)
 *   src_start  source expression given to the 8-tap kernels; it is expanded
 *              inside the loops, so it follows src as src advances
 *   avg        empty, or avg_ to select the *_avg_* kernels
 *   opt        instruction-set suffix pasted into every name
 *
 * Kernels are used only when step_q4 == 16 and filter[3] != 128.  If any of
 * filter[0..2] is non-zero the 8-tap kernels run on src_start; otherwise the
 * 2-tap kernels run on src.  Columns are consumed in strips of 16, then 8,
 * then 4.  Any width still left over -- or the entire block when the kernels
 * were not used -- goes to the C fallback.
 */
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                    uint8_t *dst, ptrdiff_t dst_stride, \
                                    const int16_t *filter_x, int x_step_q4, \
                                    const int16_t *filter_y, int y_step_q4, \
                                    int w, int h) { \
    if (step_q4 == 16 && filter[3] != 128) { \
      if (filter[0] || filter[1] || filter[2]) { \
        /* 8-tap kernels, fed from src_start. */ \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
                                                   dst, dst_stride, \
                                                   h, filter); \
        } \
        for (; w >= 8; src += 8, dst += 8, w -= 8) { \
          vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
        for (; w >= 4; src += 4, dst += 4, w -= 4) { \
          vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
      } else { \
        /* Taps 0..2 are all zero: 2-tap kernels, fed from src itself. */ \
        for (; w >= 16; src += 16, dst += 16, w -= 16) { \
          vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \
                                                   dst, dst_stride, \
                                                   h, filter); \
        } \
        for (; w >= 8; src += 8, dst += 8, w -= 8) { \
          vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
        for (; w >= 4; src += 4, dst += 4, w -= 4) { \
          vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
        } \
      } \
    } \
    if (w) { \
      /* Columns no kernel consumed are handled by the C version. */ \
      vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h); \
    } \
  }
109
/*
 * FUN_CONV_2D(avg, opt)
 *
 * Expands to the definition of vp9_convolve8_<avg><opt>(), a two-pass
 * convolution: vp9_convolve8_horiz_<opt>() writes into an on-stack
 * intermediate buffer with a stride of 64, then
 * vp9_convolve8_<avg>vert_<opt>() reads that buffer and writes dst.
 *
 *   avg   empty, or avg_ to use the averaging vertical pass / C fallback
 *   opt   instruction-set suffix pasted into every name
 *
 * w and h are asserted to be at most 64, which is what the fixed buffer
 * sizes below rely on.  Unless both x_step_q4 and y_step_q4 equal 16 the
 * call is forwarded unchanged to vp9_convolve8_<avg>c().
 *
 * When any of taps 0..2 of either filter is non-zero, or tap 3 of either
 * filter equals 128, the horizontal pass starts 3 rows above src and emits
 * h + 7 rows, and the vertical pass starts at row 3 of the buffer.
 * Otherwise the horizontal pass starts at src and emits h + 1 rows, and the
 * vertical pass starts at row 0 of the buffer.
 */
#define FUN_CONV_2D(avg, opt) \
  void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                uint8_t *dst, ptrdiff_t dst_stride, \
                                const int16_t *filter_x, int x_step_q4, \
                                const int16_t *filter_y, int y_step_q4, \
                                int w, int h) { \
    assert(w <= 64); \
    assert(h <= 64); \
    if (x_step_q4 != 16 || y_step_q4 != 16) { \
      vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
                             filter_x, x_step_q4, filter_y, y_step_q4, \
                             w, h); \
      return; \
    } \
    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
      /* Up to 64 + 7 rows of 64 bytes. */ \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, intermediate, 64 * 71); \
      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                intermediate, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 7); \
      vp9_convolve8_##avg##vert_##opt(intermediate + 3 * 64, 64, \
                                      dst, dst_stride, \
                                      filter_x, x_step_q4, filter_y, \
                                      y_step_q4, w, h); \
    } else { \
      /* Up to 64 + 1 rows of 64 bytes. */ \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, intermediate, 64 * 65); \
      vp9_convolve8_horiz_##opt(src, src_stride, intermediate, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 1); \
      vp9_convolve8_##avg##vert_##opt(intermediate, 64, dst, dst_stride, \
                                      filter_x, x_step_q4, filter_y, \
                                      y_step_q4, w, h); \
    } \
  }
#if HAVE_AVX2
/*
 * AVX2 entry points.
 *
 * Only the 16-wide 8-tap kernels are declared with an _avx2 symbol here.
 * Every other width/tap combination that FUN_CONV_1D pastes together for
 * the avx2 suffix is aliased, through the #defines below, to an SSSE3 kernel.
 *
 * NOTE(review): this section is guarded by HAVE_AVX2 alone yet references
 * SSSE3 symbols -- confirm the build never enables AVX2 without SSSE3.
 */
filter8_1dfunction vp9_filter_block1d16_v8_avx2;
filter8_1dfunction vp9_filter_block1d16_h8_avx2;

/* 4-wide vertical 8-tap kernel: same SSSE3 symbol on every target. */
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3

/* Remaining 8-tap kernels: the SSSE3 symbol used depends on the target. */
#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
#else
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#endif

/* 2-tap kernels: all aliased to SSSE3. */
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3

/*
 * The instantiations below define
 *
 *   void vp9_convolve8_horiz_avx2(...);
 *   void vp9_convolve8_vert_avx2(...);
 *   void vp9_convolve8_avx2(...);
 *
 * each taking
 *
 *   (const uint8_t *src, ptrdiff_t src_stride,
 *    uint8_t *dst, ptrdiff_t dst_stride,
 *    const int16_t *filter_x, int x_step_q4,
 *    const int16_t *filter_y, int y_step_q4,
 *    int w, int h)
 *
 * Each macro expands to a complete function definition, so no ';' follows
 * the invocation: a lone ';' at file scope is not valid ISO C.
 */
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

FUN_CONV_2D(, avx2)
#endif  /* HAVE_AVX2 */
#if HAVE_SSSE3
/*
 * SSSE3 entry points.
 *
 * On x86-64 the 16- and 8-wide 8-tap kernels and the 4-wide horizontal
 * 8-tap kernel are the *_intrin_ssse3 functions, aliased to the plain
 * *_ssse3 names that FUN_CONV_1D pastes together.  On other targets the
 * plain *_ssse3 symbols are declared directly.  The 4-wide vertical 8-tap
 * kernel is vp9_filter_block1d4_v8_ssse3 in both cases.
 */
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
#else
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#endif

/* 8-tap averaging kernels. */
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;

/* 2-tap kernels. */
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;

/* 2-tap averaging kernels. */
filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;

/*
 * The instantiations below define
 *
 *   void vp9_convolve8_horiz_ssse3(...);
 *   void vp9_convolve8_vert_ssse3(...);
 *   void vp9_convolve8_avg_horiz_ssse3(...);
 *   void vp9_convolve8_avg_vert_ssse3(...);
 *   void vp9_convolve8_ssse3(...);
 *   void vp9_convolve8_avg_ssse3(...);
 *
 * each taking
 *
 *   (const uint8_t *src, ptrdiff_t src_stride,
 *    uint8_t *dst, ptrdiff_t dst_stride,
 *    const int16_t *filter_x, int x_step_q4,
 *    const int16_t *filter_y, int y_step_q4,
 *    int w, int h)
 *
 * Each macro expands to a complete function definition, so no ';' follows
 * the invocation: a lone ';' at file scope is not valid ISO C.
 */
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3)
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3)

FUN_CONV_2D(, ssse3)
FUN_CONV_2D(avg_, ssse3)
#endif  /* HAVE_SSSE3 */
274
#if HAVE_SSE2
/*
 * SSE2 entry points.  Every width/tap/averaging combination that
 * FUN_CONV_1D pastes together for the sse2 suffix is declared directly
 * here; nothing is aliased.
 */

/* 8-tap kernels. */
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_sse2;

/* 8-tap averaging kernels. */
filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;

/* 2-tap kernels. */
filter8_1dfunction vp9_filter_block1d16_v2_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_sse2;

/* 2-tap averaging kernels. */
filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;

/*
 * The instantiations below define
 *
 *   void vp9_convolve8_horiz_sse2(...);
 *   void vp9_convolve8_vert_sse2(...);
 *   void vp9_convolve8_avg_horiz_sse2(...);
 *   void vp9_convolve8_avg_vert_sse2(...);
 *   void vp9_convolve8_sse2(...);
 *   void vp9_convolve8_avg_sse2(...);
 *
 * each taking
 *
 *   (const uint8_t *src, ptrdiff_t src_stride,
 *    uint8_t *dst, ptrdiff_t dst_stride,
 *    const int16_t *filter_x, int x_step_q4,
 *    const int16_t *filter_y, int y_step_q4,
 *    int w, int h)
 *
 * Each macro expands to a complete function definition, so no ';' follows
 * the invocation: a lone ';' at file scope is not valid ISO C.
 */
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2)
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2)

FUN_CONV_2D(, sse2)
FUN_CONV_2D(avg_, sse2)
#endif  /* HAVE_SSE2 */
340