1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "vpx_config.h"
12#include "vp8_rtcd.h"
13#include "vpx_ports/mem.h"
14#include "filter_x86.h"
15
/* Six-tap coefficient table, defined outside this file. The predict functions
 * below pick a row with xoffset or yoffset and hand it to the kernels as
 * their vp8_filter argument. */
extern const short vp8_six_tap_x86[8][6 * 8];

/* The kernels declared below are defined outside this file (presumably in the
 * x86 assembly sources -- TODO confirm). */

/* Kernels that read 8-bit samples from src_ptr and write 16-bit intermediates
 * to output_ptr. The two-pass paths in this file call them first. */
extern void vp8_filter_block1d_h6_mmx(
    unsigned char *src_ptr, unsigned short *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d8_h6_sse2(
    unsigned char *src_ptr, unsigned short *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(
    unsigned char *src_ptr, unsigned short *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* Same 8-bit in / 16-bit out shape but without a coefficient argument; the
 * 16x16 SSE2 path uses it to fill the intermediate buffer when xoffset is 0. */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);

/* Kernels that read 16-bit intermediates from src_ptr and write 8-bit samples
 * to output_ptr. The two-pass paths in this file call them second. */
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* Single-pass kernels: 8-bit samples in, 8-bit samples out. This file uses
 * them when only one of xoffset / yoffset needs filtering. */
extern void vp8_filter_block1d8_h6_only_sse2(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, int dst_pitch, unsigned int output_height,
    const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, int dst_pitch, unsigned int output_height,
    const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, int dst_pitch, unsigned int output_height,
    const short *vp8_filter);
77
78#if HAVE_MMX
79void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
80                               int xoffset, int yoffset, unsigned char *dst_ptr,
81                               int dst_pitch) {
82  DECLARE_ALIGNED(16, unsigned short,
83                  FData2[16 * 16]); /* Temp data bufffer used in filtering */
84  const short *HFilter, *VFilter;
85  HFilter = vp8_six_tap_x86[xoffset];
86  vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
87                            src_pixels_per_line, 1, 9, 8, HFilter);
88  VFilter = vp8_six_tap_x86[yoffset];
89  vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
90                             VFilter);
91}
92#endif
93
94#if HAVE_SSE2
95void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
96                                  int src_pixels_per_line, int xoffset,
97                                  int yoffset, unsigned char *dst_ptr,
98                                  int dst_pitch
99
100                                  ) {
101  DECLARE_ALIGNED(16, unsigned short,
102                  FData2[24 * 24]); /* Temp data bufffer used in filtering */
103
104  const short *HFilter, *VFilter;
105
106  if (xoffset) {
107    if (yoffset) {
108      HFilter = vp8_six_tap_x86[xoffset];
109      vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
110                                   src_pixels_per_line, 1, 21, 32, HFilter);
111      VFilter = vp8_six_tap_x86[yoffset];
112      vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
113                                   dst_pitch, VFilter);
114    } else {
115      /* First-pass only */
116      HFilter = vp8_six_tap_x86[xoffset];
117      vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
118                                        dst_pitch, 16, HFilter);
119    }
120  } else {
121    /* Second-pass only */
122    VFilter = vp8_six_tap_x86[yoffset];
123    vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
124                                 src_pixels_per_line, 21, 32);
125    vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
126                                 dst_pitch, VFilter);
127  }
128}
129
130void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
131                                int xoffset, int yoffset,
132                                unsigned char *dst_ptr, int dst_pitch) {
133  DECLARE_ALIGNED(16, unsigned short,
134                  FData2[256]); /* Temp data bufffer used in filtering */
135  const short *HFilter, *VFilter;
136
137  if (xoffset) {
138    if (yoffset) {
139      HFilter = vp8_six_tap_x86[xoffset];
140      vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
141                                  src_pixels_per_line, 1, 13, 16, HFilter);
142      VFilter = vp8_six_tap_x86[yoffset];
143      vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
144                                  dst_pitch, VFilter);
145    } else {
146      /* First-pass only */
147      HFilter = vp8_six_tap_x86[xoffset];
148      vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
149                                       dst_pitch, 8, HFilter);
150    }
151  } else {
152    /* Second-pass only */
153    VFilter = vp8_six_tap_x86[yoffset];
154    vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
155                                     src_pixels_per_line, dst_ptr, dst_pitch, 8,
156                                     VFilter);
157  }
158}
159
160void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
161                                int xoffset, int yoffset,
162                                unsigned char *dst_ptr, int dst_pitch) {
163  DECLARE_ALIGNED(16, unsigned short,
164                  FData2[256]); /* Temp data bufffer used in filtering */
165  const short *HFilter, *VFilter;
166
167  if (xoffset) {
168    if (yoffset) {
169      HFilter = vp8_six_tap_x86[xoffset];
170      vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
171                                  src_pixels_per_line, 1, 9, 16, HFilter);
172      VFilter = vp8_six_tap_x86[yoffset];
173      vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
174                                  dst_pitch, VFilter);
175    } else {
176      /* First-pass only */
177      HFilter = vp8_six_tap_x86[xoffset];
178      vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
179                                       dst_pitch, 4, HFilter);
180    }
181  } else {
182    /* Second-pass only */
183    VFilter = vp8_six_tap_x86[yoffset];
184    vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
185                                     src_pixels_per_line, dst_ptr, dst_pitch, 4,
186                                     VFilter);
187  }
188}
189
190#endif
191
192#if HAVE_SSSE3
193
/* SSSE3 kernels, defined outside this file (presumably in the x86 assembly
 * sources -- TODO confirm). All of them read and write 8-bit samples. Their
 * last argument is an index: callers in this file pass xoffset or yoffset
 * directly instead of a coefficient pointer. */

/* Kernels driven by xoffset in this file. */
extern void vp8_filter_block1d4_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

/* Kernels driven by yoffset in this file. */
extern void vp8_filter_block1d4_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);
235
236void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
237                                   int src_pixels_per_line, int xoffset,
238                                   int yoffset, unsigned char *dst_ptr,
239                                   int dst_pitch
240
241                                   ) {
242  DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
243
244  if (xoffset) {
245    if (yoffset) {
246      vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
247                                    src_pixels_per_line, FData2, 16, 21,
248                                    xoffset);
249      vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
250                                    yoffset);
251    } else {
252      /* First-pass only */
253      vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
254                                    dst_pitch, 16, xoffset);
255    }
256  } else {
257    if (yoffset) {
258      /* Second-pass only */
259      vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
260                                    src_pixels_per_line, dst_ptr, dst_pitch, 16,
261                                    yoffset);
262    } else {
263      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
264       * yoffset==0) case correctly. Add copy function here to guarantee
265       * six-tap function handles all possible offsets. */
266      vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
267    }
268  }
269}
270
271void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
272                                 int src_pixels_per_line, int xoffset,
273                                 int yoffset, unsigned char *dst_ptr,
274                                 int dst_pitch) {
275  DECLARE_ALIGNED(16, unsigned char, FData2[256]);
276
277  if (xoffset) {
278    if (yoffset) {
279      vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
280                                   src_pixels_per_line, FData2, 8, 13, xoffset);
281      vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
282    } else {
283      vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
284                                   dst_pitch, 8, xoffset);
285    }
286  } else {
287    if (yoffset) {
288      /* Second-pass only */
289      vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
290                                   src_pixels_per_line, dst_ptr, dst_pitch, 8,
291                                   yoffset);
292    } else {
293      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
294       * yoffset==0) case correctly. Add copy function here to guarantee
295       * six-tap function handles all possible offsets. */
296      vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
297    }
298  }
299}
300
301void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
302                                 int src_pixels_per_line, int xoffset,
303                                 int yoffset, unsigned char *dst_ptr,
304                                 int dst_pitch) {
305  DECLARE_ALIGNED(16, unsigned char, FData2[256]);
306
307  if (xoffset) {
308    if (yoffset) {
309      vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
310                                   src_pixels_per_line, FData2, 8, 9, xoffset);
311      vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
312    } else {
313      /* First-pass only */
314      vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
315                                   dst_pitch, 4, xoffset);
316    }
317  } else {
318    if (yoffset) {
319      /* Second-pass only */
320      vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
321                                   src_pixels_per_line, dst_ptr, dst_pitch, 4,
322                                   yoffset);
323    } else {
324      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
325       * yoffset==0) case correctly. Add copy function here to guarantee
326       * six-tap function handles all possible offsets. */
327      vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
328    }
329  }
330}
331
332void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
333                                 int src_pixels_per_line, int xoffset,
334                                 int yoffset, unsigned char *dst_ptr,
335                                 int dst_pitch) {
336  DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
337
338  if (xoffset) {
339    if (yoffset) {
340      vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
341                                   src_pixels_per_line, FData2, 4, 9, xoffset);
342      vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
343    } else {
344      vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
345                                   dst_pitch, 4, xoffset);
346    }
347  } else {
348    if (yoffset) {
349      vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
350                                   src_pixels_per_line, dst_ptr, dst_pitch, 4,
351                                   yoffset);
352    } else {
353      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
354        * yoffset==0) case correctly. Add copy function here to guarantee
355        * six-tap function handles all possible offsets. */
356      int r;
357
358      for (r = 0; r < 4; ++r) {
359        dst_ptr[0] = src_ptr[0];
360        dst_ptr[1] = src_ptr[1];
361        dst_ptr[2] = src_ptr[2];
362        dst_ptr[3] = src_ptr[3];
363        dst_ptr += dst_pitch;
364        src_ptr += src_pixels_per_line;
365      }
366    }
367  }
368}
369
370#endif
371