1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include "vpx_config.h"
13#include "vp8_rtcd.h"
14#include "vpx_ports/mem.h"
15#include "filter_x86.h"
16
/* Six-tap filter coefficient table, defined outside this file.  The wrappers
 * below index its first dimension with xoffset / yoffset, so those offsets
 * must lie in 0..7.  Each entry holds 6*8 shorts.
 * NOTE(review): presumably six taps, each replicated for SIMD use - confirm
 * against the table definition. */
extern const short vp8_six_tap_mmx[8][6*8];

/* First-pass filter used by the MMX wrappers.  In this file it is always
 * called with pixel_step == 1, an output_height of (block rows + 5) and an
 * unsigned short intermediate buffer as output_ptr.
 * NOTE(review): units of output_width (bytes vs. elements) are not visible
 * here - confirm against the implementation. */
extern void vp8_filter_block1d_h6_mmx
(
    unsigned char   *src_ptr,
    unsigned short  *output_ptr,
    unsigned int    src_pixels_per_line,
    unsigned int    pixel_step,
    unsigned int    output_height,
    unsigned int    output_width,
    const short      *vp8_filter
);
/* Second-pass filter used by the MMX wrappers.  Reads the unsigned short
 * intermediate buffer and writes unsigned char pixels to output_ptr using
 * output_pitch.  Callers in this file pass the same value for
 * pixels_per_line as they passed to the first pass as output_width. */
extern void vp8_filter_block1dc_v6_mmx
(
    unsigned short *src_ptr,
    unsigned char  *output_ptr,
    int             output_pitch,
    unsigned int    pixels_per_line,
    unsigned int    pixel_step,
    unsigned int    output_height,
    unsigned int    output_width,
    const short    *vp8_filter
);
/* First-pass filter used by the 8-wide SSE2 wrappers (8x8 and 8x4) when both
 * offsets are non-zero; writes an unsigned short intermediate buffer. */
extern void vp8_filter_block1d8_h6_sse2
(
    unsigned char  *src_ptr,
    unsigned short *output_ptr,
    unsigned int    src_pixels_per_line,
    unsigned int    pixel_step,
    unsigned int    output_height,
    unsigned int    output_width,
    const short    *vp8_filter
);
/* First-pass filter used by the 16x16 SSE2 wrapper when both offsets are
 * non-zero; writes an unsigned short intermediate buffer. */
extern void vp8_filter_block1d16_h6_sse2
(
    unsigned char  *src_ptr,
    unsigned short *output_ptr,
    unsigned int    src_pixels_per_line,
    unsigned int    pixel_step,
    unsigned int    output_height,
    unsigned int    output_width,
    const short    *vp8_filter
);
/* Second-pass filter for the 8-wide SSE2 wrappers.  "dst_ptich" is a
 * misspelling of dst_pitch (callers pass dst_pitch in that position).
 * Callers in this file also pass dst_pitch as the output_width argument. */
extern void vp8_filter_block1d8_v6_sse2
(
    unsigned short *src_ptr,
    unsigned char *output_ptr,
    int dst_ptich,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short    *vp8_filter
);
/* Second-pass filter for the 16x16 SSE2 wrapper.  "dst_ptich" is a
 * misspelling of dst_pitch (callers pass dst_pitch in that position).
 * Callers in this file also pass dst_pitch as the output_width argument. */
extern void vp8_filter_block1d16_v6_sse2
(
    unsigned short *src_ptr,
    unsigned char *output_ptr,
    int dst_ptich,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short    *vp8_filter
);
/* Fills the unsigned short intermediate buffer from source pixels without
 * taking a filter argument; used by the 16x16 SSE2 wrapper when xoffset is
 * zero, ahead of the second pass. */
extern void vp8_unpack_block1d16_h6_sse2
(
    unsigned char  *src_ptr,
    unsigned short *output_ptr,
    unsigned int    src_pixels_per_line,
    unsigned int    output_height,
    unsigned int    output_width
);
/* Single-pass variant used by the 8-wide SSE2 wrappers when yoffset is zero:
 * filters src_ptr straight into the destination ("dst_ptich" is a
 * misspelling of dst_pitch). */
extern void vp8_filter_block1d8_h6_only_sse2
(
    unsigned char  *src_ptr,
    unsigned int    src_pixels_per_line,
    unsigned char  *output_ptr,
    int dst_ptich,
    unsigned int    output_height,
    const short    *vp8_filter
);
/* Single-pass variant used by the 16x16 SSE2 wrapper when yoffset is zero:
 * filters src_ptr straight into the destination ("dst_ptich" is a
 * misspelling of dst_pitch). */
extern void vp8_filter_block1d16_h6_only_sse2
(
    unsigned char  *src_ptr,
    unsigned int    src_pixels_per_line,
    unsigned char  *output_ptr,
    int dst_ptich,
    unsigned int    output_height,
    const short    *vp8_filter
);
/* Single-pass variant used by the 8-wide SSE2 wrappers when xoffset is zero.
 * Callers pass a source pointer two rows above the block ("dst_ptich" is a
 * misspelling of dst_pitch). */
extern void vp8_filter_block1d8_v6_only_sse2
(
    unsigned char *src_ptr,
    unsigned int   src_pixels_per_line,
    unsigned char *output_ptr,
    int dst_ptich,
    unsigned int   output_height,
    const short   *vp8_filter
);
117
118
119#if HAVE_MMX
120void vp8_sixtap_predict4x4_mmx
121(
122    unsigned char  *src_ptr,
123    int   src_pixels_per_line,
124    int  xoffset,
125    int  yoffset,
126    unsigned char *dst_ptr,
127    int dst_pitch
128)
129{
130    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16);  /* Temp data bufffer used in filtering */
131    const short *HFilter, *VFilter;
132    HFilter = vp8_six_tap_mmx[xoffset];
133    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
134    VFilter = vp8_six_tap_mmx[yoffset];
135    vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
136
137}
138
139
140void vp8_sixtap_predict16x16_mmx
141(
142    unsigned char  *src_ptr,
143    int   src_pixels_per_line,
144    int  xoffset,
145    int  yoffset,
146    unsigned char *dst_ptr,
147    int dst_pitch
148)
149{
150
151    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  /* Temp data bufffer used in filtering */
152
153    const short *HFilter, *VFilter;
154
155
156    HFilter = vp8_six_tap_mmx[xoffset];
157
158    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 21, 32, HFilter);
159    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
160    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,  FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
161    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
162
163    VFilter = vp8_six_tap_mmx[yoffset];
164    vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, 16, VFilter);
165    vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
166    vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
167    vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
168
169}
170
171
172void vp8_sixtap_predict8x8_mmx
173(
174    unsigned char  *src_ptr,
175    int   src_pixels_per_line,
176    int  xoffset,
177    int  yoffset,
178    unsigned char *dst_ptr,
179    int dst_pitch
180)
181{
182
183    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    /* Temp data bufffer used in filtering */
184
185    const short *HFilter, *VFilter;
186
187    HFilter = vp8_six_tap_mmx[xoffset];
188    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 13, 16, HFilter);
189    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
190
191    VFilter = vp8_six_tap_mmx[yoffset];
192    vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 8, 8, VFilter);
193    vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
194
195}
196
197
198void vp8_sixtap_predict8x4_mmx
199(
200    unsigned char  *src_ptr,
201    int   src_pixels_per_line,
202    int  xoffset,
203    int  yoffset,
204    unsigned char *dst_ptr,
205    int dst_pitch
206)
207{
208
209    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    /* Temp data bufffer used in filtering */
210
211    const short *HFilter, *VFilter;
212
213    HFilter = vp8_six_tap_mmx[xoffset];
214    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 9, 16, HFilter);
215    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
216
217    VFilter = vp8_six_tap_mmx[yoffset];
218    vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 4, 8, VFilter);
219    vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
220
221}
222
223
224
/*
 * Bilinear sub-pixel prediction for a 16x16 block (MMX path).
 *
 * Delegates to vp8_bilinear_predict8x8_mmx once per 8x8 quadrant, in the
 * order top-left, top-right, bottom-left, bottom-right.  The same xoffset
 * and yoffset are forwarded unchanged to every call.
 */
void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr,
                                   int            src_pixels_per_line,
                                   int            xoffset,
                                   int            yoffset,
                                   unsigned char *dst_ptr,
                                   int            dst_pitch)
{
    int row, col;

    for (row = 0; row < 16; row += 8)
    {
        for (col = 0; col < 16; col += 8)
        {
            vp8_bilinear_predict8x8_mmx(
                src_ptr + row * src_pixels_per_line + col,
                src_pixels_per_line, xoffset, yoffset,
                dst_ptr + dst_pitch * row + col, dst_pitch);
        }
    }
}
240#endif
241
242
243#if HAVE_SSE2
244void vp8_sixtap_predict16x16_sse2
245(
246    unsigned char  *src_ptr,
247    int   src_pixels_per_line,
248    int  xoffset,
249    int  yoffset,
250    unsigned char *dst_ptr,
251    int dst_pitch
252
253)
254{
255    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);    /* Temp data bufffer used in filtering */
256
257    const short *HFilter, *VFilter;
258
259    if (xoffset)
260    {
261        if (yoffset)
262        {
263            HFilter = vp8_six_tap_mmx[xoffset];
264            vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 21, 32, HFilter);
265            VFilter = vp8_six_tap_mmx[yoffset];
266            vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
267        }
268        else
269        {
270            /* First-pass only */
271            HFilter = vp8_six_tap_mmx[xoffset];
272            vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
273        }
274    }
275    else
276    {
277        /* Second-pass only */
278        VFilter = vp8_six_tap_mmx[yoffset];
279        vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 21, 32);
280        vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
281    }
282}
283
284
285void vp8_sixtap_predict8x8_sse2
286(
287    unsigned char  *src_ptr,
288    int   src_pixels_per_line,
289    int  xoffset,
290    int  yoffset,
291    unsigned char *dst_ptr,
292    int dst_pitch
293)
294{
295    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data bufffer used in filtering */
296    const short *HFilter, *VFilter;
297
298    if (xoffset)
299    {
300        if (yoffset)
301        {
302            HFilter = vp8_six_tap_mmx[xoffset];
303            vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 13, 16, HFilter);
304            VFilter = vp8_six_tap_mmx[yoffset];
305            vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
306        }
307        else
308        {
309            /* First-pass only */
310            HFilter = vp8_six_tap_mmx[xoffset];
311            vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
312        }
313    }
314    else
315    {
316        /* Second-pass only */
317        VFilter = vp8_six_tap_mmx[yoffset];
318        vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
319    }
320}
321
322
323void vp8_sixtap_predict8x4_sse2
324(
325    unsigned char  *src_ptr,
326    int   src_pixels_per_line,
327    int  xoffset,
328    int  yoffset,
329    unsigned char *dst_ptr,
330    int dst_pitch
331)
332{
333    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data bufffer used in filtering */
334    const short *HFilter, *VFilter;
335
336    if (xoffset)
337    {
338        if (yoffset)
339        {
340            HFilter = vp8_six_tap_mmx[xoffset];
341            vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 9, 16, HFilter);
342            VFilter = vp8_six_tap_mmx[yoffset];
343            vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
344        }
345        else
346        {
347            /* First-pass only */
348            HFilter = vp8_six_tap_mmx[xoffset];
349            vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
350        }
351    }
352    else
353    {
354        /* Second-pass only */
355        VFilter = vp8_six_tap_mmx[yoffset];
356        vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
357    }
358}
359
360#endif
361
362#if HAVE_SSSE3
363
/* SSSE3 filter routines, implemented outside this file.  Unlike the
 * MMX/SSE2 routines they take a filter index (the wrappers pass xoffset or
 * yoffset directly) instead of a coefficient pointer, and both read and
 * write unsigned char data. */

/* First-pass filter for 8-wide blocks; also used alone, writing straight to
 * the destination, when yoffset is zero. */
extern void vp8_filter_block1d8_h6_ssse3
(
    unsigned char  *src_ptr,
    unsigned int    src_pixels_per_line,
    unsigned char  *output_ptr,
    unsigned int    output_pitch,
    unsigned int    output_height,
    unsigned int    vp8_filter_index
);

/* First-pass filter for 16-wide blocks; also used alone, writing straight
 * to the destination, when yoffset is zero. */
extern void vp8_filter_block1d16_h6_ssse3
(
    unsigned char  *src_ptr,
    unsigned int    src_pixels_per_line,
    unsigned char  *output_ptr,
    unsigned int    output_pitch,
    unsigned int    output_height,
    unsigned int    vp8_filter_index
);

/* Second-pass filter for 16-wide blocks.  Callers pass either the start of
 * the intermediate buffer or a source pointer two rows above the block. */
extern void vp8_filter_block1d16_v6_ssse3
(
    unsigned char *src_ptr,
    unsigned int   src_pitch,
    unsigned char *output_ptr,
    unsigned int   out_pitch,
    unsigned int   output_height,
    unsigned int   vp8_filter_index
);

/* Second-pass filter for 8-wide blocks.  Callers pass either the start of
 * the intermediate buffer or a source pointer two rows above the block. */
extern void vp8_filter_block1d8_v6_ssse3
(
    unsigned char *src_ptr,
    unsigned int   src_pitch,
    unsigned char *output_ptr,
    unsigned int   out_pitch,
    unsigned int   output_height,
    unsigned int   vp8_filter_index
);

/* First-pass filter for 4-wide blocks; also used alone, writing straight to
 * the destination, when yoffset is zero. */
extern void vp8_filter_block1d4_h6_ssse3
(
    unsigned char  *src_ptr,
    unsigned int    src_pixels_per_line,
    unsigned char  *output_ptr,
    unsigned int    output_pitch,
    unsigned int    output_height,
    unsigned int    vp8_filter_index
);

/* Second-pass filter for 4-wide blocks.  Callers pass either the start of
 * the intermediate buffer or a source pointer two rows above the block. */
extern void vp8_filter_block1d4_v6_ssse3
(
    unsigned char *src_ptr,
    unsigned int   src_pitch,
    unsigned char *output_ptr,
    unsigned int   out_pitch,
    unsigned int   output_height,
    unsigned int   vp8_filter_index
);
423
424void vp8_sixtap_predict16x16_ssse3
425(
426    unsigned char  *src_ptr,
427    int   src_pixels_per_line,
428    int  xoffset,
429    int  yoffset,
430    unsigned char *dst_ptr,
431    int dst_pitch
432
433)
434{
435    DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
436
437    if (xoffset)
438    {
439        if (yoffset)
440        {
441            vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
442                                          src_pixels_per_line, FData2,
443                                          16, 21, xoffset);
444            vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
445                                          16, yoffset);
446        }
447        else
448        {
449            /* First-pass only */
450            vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
451                                          dst_ptr, dst_pitch, 16, xoffset);
452        }
453    }
454    else
455    {
456        if (yoffset)
457        {
458            /* Second-pass only */
459            vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
460                                          src_pixels_per_line,
461                                          dst_ptr, dst_pitch, 16, yoffset);
462        }
463        else
464        {
465            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
466             * yoffset==0) case correctly. Add copy function here to guarantee
467             * six-tap function handles all possible offsets. */
468            vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
469        }
470    }
471}
472
473void vp8_sixtap_predict8x8_ssse3
474(
475    unsigned char  *src_ptr,
476    int   src_pixels_per_line,
477    int  xoffset,
478    int  yoffset,
479    unsigned char *dst_ptr,
480    int dst_pitch
481)
482{
483    DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
484
485    if (xoffset)
486    {
487        if (yoffset)
488        {
489            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
490                                         src_pixels_per_line, FData2,
491                                         8, 13, xoffset);
492            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
493                                         8, yoffset);
494        }
495        else
496        {
497            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
498                                         dst_ptr, dst_pitch, 8, xoffset);
499        }
500    }
501    else
502    {
503        if (yoffset)
504        {
505            /* Second-pass only */
506            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
507                                         src_pixels_per_line,
508                                         dst_ptr, dst_pitch, 8, yoffset);
509        }
510        else
511        {
512            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
513             * yoffset==0) case correctly. Add copy function here to guarantee
514             * six-tap function handles all possible offsets. */
515            vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
516        }
517    }
518}
519
520
521void vp8_sixtap_predict8x4_ssse3
522(
523    unsigned char  *src_ptr,
524    int   src_pixels_per_line,
525    int  xoffset,
526    int  yoffset,
527    unsigned char *dst_ptr,
528    int dst_pitch
529)
530{
531    DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
532
533    if (xoffset)
534    {
535        if (yoffset)
536        {
537            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
538                                         src_pixels_per_line, FData2,
539                                         8, 9, xoffset);
540            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
541                                         4, yoffset);
542        }
543        else
544        {
545            /* First-pass only */
546            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
547                                         dst_ptr, dst_pitch, 4, xoffset);
548        }
549    }
550    else
551    {
552        if (yoffset)
553        {
554            /* Second-pass only */
555            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
556                                         src_pixels_per_line,
557                                         dst_ptr, dst_pitch, 4, yoffset);
558        }
559        else
560        {
561            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
562             * yoffset==0) case correctly. Add copy function here to guarantee
563             * six-tap function handles all possible offsets. */
564            vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
565        }
566    }
567}
568
569void vp8_sixtap_predict4x4_ssse3
570(
571    unsigned char  *src_ptr,
572    int   src_pixels_per_line,
573    int  xoffset,
574    int  yoffset,
575    unsigned char *dst_ptr,
576    int dst_pitch
577)
578{
579  DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
580
581  if (xoffset)
582  {
583      if (yoffset)
584      {
585          vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
586                                       src_pixels_per_line,
587                                       FData2, 4, 9, xoffset);
588          vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
589                                       4, yoffset);
590      }
591      else
592      {
593          vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
594                                       dst_ptr, dst_pitch, 4, xoffset);
595      }
596  }
597  else
598  {
599      if (yoffset)
600      {
601          vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
602                                       src_pixels_per_line,
603                                       dst_ptr, dst_pitch, 4, yoffset);
604      }
605      else
606      {
607        /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
608          * yoffset==0) case correctly. Add copy function here to guarantee
609          * six-tap function handles all possible offsets. */
610          int r;
611
612          for (r = 0; r < 4; r++)
613          {
614            dst_ptr[0]  = src_ptr[0];
615            dst_ptr[1]  = src_ptr[1];
616            dst_ptr[2]  = src_ptr[2];
617            dst_ptr[3]  = src_ptr[3];
618            dst_ptr     += dst_pitch;
619            src_ptr     += src_pixels_per_line;
620          }
621      }
622  }
623}
624
625#endif
626