;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13extern sym(vp8_bilinear_filters_x86_8)
14
15%define BLOCK_HEIGHT_WIDTH 4
16%define VP8_FILTER_WEIGHT 128
17%define VP8_FILTER_SHIFT  7
18
19
;/************************************************************************************
; Notes: vp8_filter_block1d8_h6_sse2 applies a 6 tap filter horizontally to the input
; pixels and stores the clamped results as 16-bit words. Each loop iteration produces
; ONE row of 8 output pixels using 128-bit operations. The row counter is decremented
; before it is tested, so output_height must be non-zero.
;
; Per row the routine reads the 16 source bytes src[-2..13] and writes 16 bytes to
; output_ptr with movdqa, so every output row must be 16-byte aligned.
; vp8_filter is read as six 16-byte vectors (one per tap) at byte offsets 0..80.
; output_width is added to output_ptr as a byte offset after each row.
; pixel_step (arg 3) is not read.
;
; Registers: rsi = source row, rdi = output row, rdx = filter taps, rcx = rows left,
;            rax = source pitch, r8 = output_width (64-bit builds only), xmm0 = zero.
;*************************************************************************************/
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(6) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(1) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5) ;output_width
%endif
        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

.filter_block1d8_h6_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes src[-2..5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes src[6..13]

        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2..13]

        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2


        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03


        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

        ; Saturating accumulation; the taps are added in the order 2 5 3 1 4 6.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector rd

        psraw       xmm4,       7                           ; scale the sum back down by 2^7

        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes
        punpcklbw   xmm4,       xmm0                        ; zero-extend back to 8 words

        movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned store of 8 words
        lea         rsi,        [rsi + rax]                 ; next source row

%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
        add         rdi,        r8
%endif
        dec         rcx

        jnz         .filter_block1d8_h6_rowloop                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d16_h6_sse2 applies a 6 tap filter horizontally to the input
; pixels and stores the clamped results as 16-bit words. Each loop iteration produces
; ONE row of 16 output pixels, computed as two halves of 8 pixels. The row counter is
; decremented before it is tested, so output_height must be non-zero.
;
; Per row the routine reads the 24 source bytes src[-2..21] and writes 32 bytes to
; output_ptr with two movdqa stores, so every output row must be 16-byte aligned.
; vp8_filter is read as six 16-byte vectors (one per tap) at byte offsets 0..80.
; output_width is added to output_ptr as a byte offset after each row.
; pixel_step (arg 3) is not read.
;
; Registers: rsi = source row, rdi = output row, rdx = filter taps, rcx = rows left,
;            rax = source pitch, r8 = output_width (64-bit builds only), xmm0 = zero.
;*************************************************************************************/
global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(6) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(1) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5) ;output_width
%endif

        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

.filter_block1d16_h6_sse2_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes src[-2..5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes src[6..13]

        movq        xmm2,       MMWORD PTR [rsi +14]        ; bytes src[14..21]
        pslldq      xmm2,       8

        por         xmm2,       xmm1                        ; xmm2 = 16 bytes src[6..21] (second half)
        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2..13] (first half)

        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        ; ---- output pixels 0..7 ----
        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2


        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03


        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

        ; Saturating accumulation; the taps are added in the order 2 5 3 1 4 6.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector rd

        psraw       xmm4,       7                           ; scale the sum back down by 2^7

        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes
        punpcklbw   xmm4,       xmm0                        ; zero-extend back to 8 words

        movdqa      XMMWORD Ptr [rdi],         xmm4         ; output words 0..7

        ; ---- output pixels 8..15, taken from xmm2 = src[6..21] ----
        movdqa      xmm3,       xmm2
        movdqa      xmm4,       xmm2

        movdqa      xmm5,       xmm2
        movdqa      xmm6,       xmm2

        movdqa      xmm7,       xmm2

        punpcklbw   xmm3,       xmm0                        ; words src[6..13]
        psrldq      xmm4,       1                           ; drop 1 byte: starts at src[7]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; words src[7..14]

        psrldq      xmm5,       2                           ; drop 2 bytes: starts at src[8]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2


        punpcklbw   xmm5,       xmm0                        ; words src[8..15]
        psrldq      xmm6,       3                           ; drop 3 bytes: starts at src[9]

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; words src[9..16]
        psrldq      xmm7,       4                           ; drop 4 bytes: starts at src[10]

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; words src[10..17]
        psrldq      xmm2,       5                           ; drop 5 bytes: starts at src[11]

        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm2,       xmm0                        ; words src[11..18]
        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

        ; Same saturating accumulation order as the first half: 2 5 3 1 4 6.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm2
        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector rd

        psraw       xmm4,       7                           ; scale the sum back down by 2^7

        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes
        punpcklbw   xmm4,       xmm0                        ; zero-extend back to 8 words

        movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; output words 8..15

        lea         rsi,        [rsi + rax]                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
        add         rdi,        r8
%endif

        dec         rcx
        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_sse2
;(
;    short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    short * vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d8_v6_sse2 applies a 6 tap filter vertically to rows of
; 16-bit input values and stores 8 clamped bytes per output row. Each loop iteration
; produces ONE output row; the row counter is decremented before it is tested, so
; output_height must be non-zero.
;
; pixels_per_line is used as a byte offset between consecutive input rows. The first
; row read lies two rows before src_ptr and six consecutive rows feed each output row.
; Input rows are loaded with movdqa and must therefore be 16-byte aligned.
; vp8_filter is read as six 16-byte vectors (one per tap) at byte offsets 0..80.
; pixel_step (arg 4) and output_width (arg 6) are not read.
;
; Registers: rsi = top input row of the window, rdi = output row, rax = filter taps,
;            rdx = input row pitch in bytes, rcx = rows left, xmm0 = zero,
;            xmm7 = rounding vector rd, r8 = dst_pitch (64-bit builds only).
;*************************************************************************************/
global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,        arg(7) ;vp8_filter
        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

        mov         rdi,        arg(1) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        sub         rsi,        rdx                         ; step back two input rows so the
        sub         rsi,        rdx                         ; window starts at row -2

        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
        pxor        xmm0,       xmm0                        ; clear xmm0

        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding vector, kept for the whole loop
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(2) ; dst_pitch
%endif

.vp8_filter_block1d8_v6_sse2_loop:
        movdqa      xmm1,       XMMWORD PTR [rsi]           ; window row 1
        pmullw      xmm1,       [rax]                       ; * Tap 1

        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; window row 2
        pmullw      xmm2,       [rax + 16]                  ; * Tap 2

        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; window row 3
        pmullw      xmm3,       [rax + 32]                  ; * Tap 3

        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; window row 5
        pmullw      xmm5,       [rax + 64]                  ; * Tap 5

        add         rsi,        rdx                         ; advance the window by one row; rows 4
                                                            ; and 6 are addressed from the new base
        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; window row 4

        pmullw      xmm4,       [rax + 48]                  ; * Tap 4
        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; window row 6

        pmullw      xmm6,       [rax + 80]                  ; * Tap 6

        ; Saturating accumulation; the taps are added in the order 2 5 3 1 4 6.
        paddsw      xmm2,       xmm5
        paddsw      xmm2,       xmm3

        paddsw      xmm2,       xmm1
        paddsw      xmm2,       xmm4

        paddsw      xmm2,       xmm6
        paddsw      xmm2,       xmm7                        ; add the rounding vector

        psraw       xmm2,       7                           ; scale the sum back down by 2^7
        packuswb    xmm2,       xmm0              ; pack and saturate

        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
        add         rdi,        r8
%endif
        dec         rcx         ; decrement count
        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    const short    *vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d16_v6_sse2 applies a 6 tap filter vertically to rows of
; 16-bit input values and stores 16 clamped bytes per output row. Each loop iteration
; produces ONE output row; the row counter is decremented before it is tested, so
; output_height must be non-zero.
;
; pixels_per_line is used as a byte offset between consecutive input rows. The first
; row read lies two rows before src_ptr and six consecutive rows feed each output row.
; Each input row is read as two 16-byte vectors with movdqa and each output row is
; written with movdqa, so both must be 16-byte aligned.
; vp8_filter is read as six 16-byte vectors (one per tap) at byte offsets 0..80.
; pixel_step (arg 4) and output_width (arg 6) are not read.
;
; Registers: rsi = top input row of the window, rdi = output row, rax = filter taps,
;            rdx = input row pitch in bytes, rcx = rows left,
;            r8 = dst_pitch (64-bit builds only).
;            xmm1 / xmm2 accumulate the low / high 8 output pixels.
;*************************************************************************************/
global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,        arg(7) ;vp8_filter
        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

        mov         rdi,        arg(1) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        sub         rsi,        rdx                         ; step back two input rows so the
        sub         rsi,        rdx                         ; window starts at row -2

        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(2) ; dst_pitch
%endif

.vp8_filter_block1d16_v6_sse2_loop:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
        pmullw      xmm1,       [rax + 16]
        pmullw      xmm2,       [rax + 16]

        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
        pmullw      xmm3,       [rax + 64]
        pmullw      xmm4,       [rax + 64]

        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
        pmullw      xmm5,       [rax + 32]
        pmullw      xmm6,       [rax + 32]

        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
        pmullw      xmm7,       [rax]
        pmullw      xmm0,       [rax]

        paddsw      xmm1,       xmm3                        ; line 2 + line 5
        paddsw      xmm2,       xmm4
        paddsw      xmm1,       xmm5                        ; + line 3
        paddsw      xmm2,       xmm6
        paddsw      xmm1,       xmm7                        ; + line 1
        paddsw      xmm2,       xmm0

        add         rsi,        rdx                         ; advance the window by one row; lines 4
                                                            ; and 6 are addressed from the new base

        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
        pmullw      xmm3,       [rax + 48]
        pmullw      xmm4,       [rax + 48]

        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
        pmullw      xmm5,       [rax + 80]
        pmullw      xmm6,       [rax + 80]

        ; xmm7 is reused as a data register above, so the rounding vector is
        ; reloaded on every iteration.
        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]

        paddsw      xmm1,       xmm3                        ; + line 4
        paddsw      xmm2,       xmm4
        paddsw      xmm1,       xmm5                        ; + line 6
        paddsw      xmm2,       xmm6

        paddsw      xmm1,       xmm7                        ; add the rounding vector
        paddsw      xmm2,       xmm7

        psraw       xmm1,       7                           ; scale the sums back down by 2^7
        psraw       xmm2,       7

        packuswb    xmm1,       xmm2              ; pack and saturate
        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
        add         rdi,        r8
%endif
        dec         rcx         ; decrement count
        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Applies the 6 tap filter horizontally and stores 8 clamped bytes per row directly
; to output_ptr. Each loop iteration produces ONE row of 8 pixels; the row counter
; is decremented before it is tested, so output_height must be non-zero.
; Per row the routine reads the 16 source bytes src[-2..13].
; vp8_filter is read as six 16-byte vectors (one per tap) at byte offsets 0..80.
;
; Registers: rsi = source row, rdi = output row, rdx = filter taps, rcx = rows left,
;            rax = source pitch, r8 = dst_pitch (64-bit builds only), xmm0 = zero.
global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(5) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(2) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif
        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

.filter_block1d8_h6_only_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes src[-2..5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes src[6..13]

        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2..13]

        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2


        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03


        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

        ; Saturating accumulation; the taps are added in the order 2 5 3 1 4 6.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector rd

        psraw       xmm4,       7                           ; scale the sum back down by 2^7

        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes

        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
        lea         rsi,        [rsi + rax]                 ; next source row

%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
%else
        add         rdi,        r8
%endif
        dec         rcx

        jnz         .filter_block1d8_h6_only_rowloop               ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Applies the 6 tap filter horizontally and stores 16 clamped bytes per row directly
; to output_ptr, computed as two halves of 8 pixels. Each loop iteration produces ONE
; row; the row counter is decremented before it is tested, so output_height must be
; non-zero. Per row the routine reads the 24 source bytes src[-2..21].
; vp8_filter is read as six 16-byte vectors (one per tap) at byte offsets 0..80.
;
; Registers: rsi = source row, rdi = output row, rdx = filter taps, rcx = rows left,
;            rax = source pitch, r8 = dst_pitch (64-bit builds only), xmm0 = zero.
global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(5) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(2) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif

        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

.filter_block1d16_h6_only_sse2_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes src[-2..5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes src[6..13]

        movq        xmm2,       MMWORD PTR [rsi +14]        ; bytes src[14..21]
        pslldq      xmm2,       8

        por         xmm2,       xmm1                        ; xmm2 = 16 bytes src[6..21] (second half)
        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2..13] (first half)

        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        ; ---- output pixels 0..7 ----
        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

        ; Saturating accumulation; the taps are added in the order 2 5 3 1 4 6.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector rd

        psraw       xmm4,       7                           ; scale the sum back down by 2^7

        packuswb    xmm4,       xmm0                        ; lower 8 bytes

        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination

        ; ---- output pixels 8..15, taken from xmm2 = src[6..21] ----
        movdqa      xmm3,       xmm2
        movdqa      xmm4,       xmm2

        movdqa      xmm5,       xmm2
        movdqa      xmm6,       xmm2

        movdqa      xmm7,       xmm2

        punpcklbw   xmm3,       xmm0                        ; words src[6..13]
        psrldq      xmm4,       1                           ; drop 1 byte: starts at src[7]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; words src[7..14]

        psrldq      xmm5,       2                           ; drop 2 bytes: starts at src[8]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

        punpcklbw   xmm5,       xmm0                        ; words src[8..15]
        psrldq      xmm6,       3                           ; drop 3 bytes: starts at src[9]

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; words src[9..16]
        psrldq      xmm7,       4                           ; drop 4 bytes: starts at src[10]

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; words src[10..17]
        psrldq      xmm2,       5                           ; drop 5 bytes: starts at src[11]

        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm2,       xmm0                        ; words src[11..18]
        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

        ; Same saturating accumulation order as the first half: 2 5 3 1 4 6.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm2
        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector rd

        psraw       xmm4,       7                           ; scale the sum back down by 2^7

        packuswb    xmm4,       xmm0                        ; higher 8 bytes

        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination

        lea         rsi,        [rsi + rax]                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
%else
        add         rdi,        r8
%endif

        dec         rcx
        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char *output_ptr,
;    int dst_ptich,
;    unsigned int output_height,
;    const short    *vp8_filter
;)
; Second-pass (vertical) 6-tap filter, used on its own when xoffset==0.
;
; For every output row, 8 source bytes are read from each of six consecutive
; source rows starting at src_ptr. Each row is widened to 16-bit lanes and
; multiplied by the matching 8-lane coefficient vector at vp8_filter + 16*k
; (k = 0..5, read through aligned memory operands). The six products are
; summed with signed saturation, the rounding constant `rd` is added, the sum
; is shifted right by VP8_FILTER_SHIFT and packed to 8 unsigned bytes.
;
; Register roles inside the loop:
;   rsi  = top source row of the 6-row window (advances one row per iteration)
;   rdi  = destination row
;   rdx  = src_pixels_per_line
;   rax  = vp8_filter
;   rcx  = rows left; the loop is do-while shaped, so output_height must be > 0
;   r8   = dst_ptich (64-bit builds only; 32-bit builds re-read arg(3))
;   xmm0 = zero, xmm7 = rounding constant
global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; pointers
        mov         rax,        arg(5)                      ; vp8_filter
        mov         rsi,        arg(0)                      ; src_ptr
        mov         rdi,        arg(2)                      ; output_ptr

        ; strides and row count
        movsxd      rdx,        dword ptr arg(1)            ; src_pixels_per_line
        movsxd      rcx,        dword ptr arg(4)            ; output_height
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(3)            ; dst_ptich
%endif

        ; loop-invariant vectors
        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding term
        pxor        xmm0,       xmm0                        ; zero, for byte->word widening

.v6_only_8_next_row:
        ; Fetch the six source rows of the window. Rows 0, 1, 2 and 4 are
        ; addressed from the current rsi; stepping rsi down one row then makes
        ; rows 3 and 5 reachable with the *2 and *4 scaled forms, and doubles
        ; as the per-iteration source advance.
        movq        xmm1,       [rsi]                       ; row 0
        movq        xmm2,       [rsi + rdx]                 ; row 1
        movq        xmm3,       [rsi + rdx * 2]             ; row 2
        movq        xmm5,       [rsi + rdx * 4]             ; row 4
        add         rsi,        rdx
        movq        xmm4,       [rsi + rdx * 2]             ; row 3
        movq        xmm6,       [rsi + rdx * 4]             ; row 5

        ; widen every row to 16-bit lanes
        punpcklbw   xmm1,       xmm0
        punpcklbw   xmm2,       xmm0
        punpcklbw   xmm3,       xmm0
        punpcklbw   xmm4,       xmm0
        punpcklbw   xmm5,       xmm0
        punpcklbw   xmm6,       xmm0

        ; scale each row by its tap
        pmullw      xmm1,       [rax]                       ; tap 0
        pmullw      xmm2,       [rax + 16]                  ; tap 1
        pmullw      xmm3,       [rax + 32]                  ; tap 2
        pmullw      xmm4,       [rax + 48]                  ; tap 3
        pmullw      xmm5,       [rax + 64]                  ; tap 4
        pmullw      xmm6,       [rax + 80]                  ; tap 5

        ; Accumulate into xmm2. The adds saturate, so the summation order
        ; (tap 1, 4, 2, 0, 3, 5, then rounding) is part of the result and
        ; must not be rearranged.
        paddsw      xmm2,       xmm5
        paddsw      xmm2,       xmm3
        paddsw      xmm2,       xmm1
        paddsw      xmm2,       xmm4
        paddsw      xmm2,       xmm6
        paddsw      xmm2,       xmm7

        psraw       xmm2,       VP8_FILTER_SHIFT            ; drop the filter scale
        packuswb    xmm2,       xmm0                        ; clamp to 0..255, 8 bytes in low half

        movq        [rdi],      xmm2                        ; write one 8-pixel output row
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(3)            ; next destination row
%else
        add         rdi,        r8                          ; next destination row
%endif
        dec         rcx
        jnz         .v6_only_8_next_row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
897
898
;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
; Widens a 16-pixel-wide block from bytes to 16-bit words without filtering.
;
; Each iteration reads 16 bytes at src_ptr (as two 8-byte halves),
; zero-extends every byte to a word and writes the resulting 32 bytes to
; output_ptr with aligned stores, so every destination row must be 16-byte
; aligned. The source then advances by src_pixels_per_line and the
; destination by output_width, which is added to the pointer as a byte count.
; The loop is do-while shaped: output_height must be > 0.
global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(1)                      ; output_ptr
        mov         rsi,        arg(0)                      ; src_ptr

        movsxd      rax,        dword ptr arg(2)            ; src_pixels_per_line (source stride)
        movsxd      rcx,        dword ptr arg(3)            ; output_height (rows to process)
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(4)            ; output_width (destination stride)
%endif

        pxor        xmm0,       xmm0                        ; zero, interleaved as the high bytes

.unpack16_next_row:
        movq        xmm1,       [rsi]                       ; source bytes 0..7
        movq        xmm3,       [rsi+8]                     ; source bytes 8..15

        punpcklbw   xmm1,       xmm0                        ; -> words 0..7
        punpcklbw   xmm3,       xmm0                        ; -> words 8..15

        movdqa      [rdi],      xmm1
        movdqa      [rdi + 16], xmm3

        add         rsi,        rax                         ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(4)            ; next destination row
%else
        add         rdi,        r8                          ; next destination row
%endif
        dec         rcx
        jnz         .unpack16_next_row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
954
955
;void vp8_bilinear_predict16x16_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 16x16 two-tap (bilinear) prediction.
;
; xoffset and yoffset each select a 32-byte entry of
; vp8_bilinear_filters_x86_8: the first 16 bytes are the 8-lane weight applied
; to the nearer sample, the second 16 bytes the weight applied to the next
; sample (one pixel to the right for the horizontal pass, one row down for the
; vertical pass). Every weighted sum is rounded with `rd` and shifted right by
; VP8_FILTER_SHIFT.
;
; Three code paths:
;   xoffset == 0                -> .b16x16_sp_only : vertical pass only
;   xoffset != 0, yoffset == 0  -> .b16x16_fp_only : horizontal pass only
;   otherwise                   -> horizontal pass feeding the vertical pass
;
; Results are written with aligned 16-byte stores, so dst_ptr and every
; destination row reached through dst_pitch must be 16-byte aligned. Each loop
; stops when the destination pointer equals dst_ptr + 16*dst_pitch.

; Round and scale the two word vectors xmm3 (pixels 0..7) and xmm4 (8..15).
%macro BILINEAR16_ROUND_SHIFT 0
        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT
        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT
%endmacro

; Horizontal pass over the 16-pixel row at [rsi].
;   in : rsi = row, xmm0 = 0, xmm1 = weight for pixel x, xmm2 = weight for x+1
;   out: xmm3 / xmm4 = filtered, rounded and shifted words for pixels 0..7 / 8..15
;   clobbers: xmm5, xmm6
%macro BILINEAR16_HFILTER_ROW 0
        movdqu      xmm3,       [rsi]               ; pixels x
        movdqu      xmm5,       [rsi+1]             ; pixels x+1
        movdqa      xmm4,       xmm3
        movdqa      xmm6,       xmm5

        punpcklbw   xmm3,       xmm0
        punpckhbw   xmm4,       xmm0
        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm1
        pmullw      xmm5,       xmm2
        pmullw      xmm6,       xmm2

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        BILINEAR16_ROUND_SHIFT
%endmacro

extern sym(vp8_bilinear_filters_x86_8)
global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        movsxd      rax,        dword ptr arg(2) ;xoffset

        test        rax,        rax    ;no horizontal pass needed when xoffset=0
        jz          .b16x16_sp_only

        shl         rax,        5      ;32 bytes per filter entry
        add         rax,        rcx    ;HFilter

        movdqa      xmm1,       [rax]               ; horizontal weight, pixel x
        movdqa      xmm2,       [rax+16]            ; horizontal weight, pixel x+1

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(4) ;dst_ptr
        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

        movsxd      rax,        dword ptr arg(3) ;yoffset

        test        rax,        rax    ;no vertical pass needed when yoffset=0
        jz          .b16x16_fp_only

    ; ---- both passes -------------------------------------------------------
    ; rax = VFilter, rcx = end of destination, rdx = source stride,
    ; xmm7 = previous row after the horizontal pass, packed to bytes.

        shl         rax,        5
        add         rax,        rcx    ;VFilter

        lea         rcx,        [rdi+rdx*8]
        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        pxor        xmm0,       xmm0

        ; prime xmm7 with the horizontally filtered first row
        BILINEAR16_HFILTER_ROW

        movdqa      xmm7,       xmm3
        packuswb    xmm7,       xmm4

        add         rsi,        rdx                 ; next line
.b16x16_both_next_row:
        BILINEAR16_HFILTER_ROW                      ; xmm3/xmm4 = current row

        ; previous row (bytes in xmm7) -> words, times the first vertical weight
        movdqa      xmm5,       xmm7
        movdqa      xmm6,       xmm7

        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0

        pmullw      xmm5,       [rax]
        pmullw      xmm6,       [rax]

        ; keep the current row as next iteration's "previous" row
        movdqa      xmm7,       xmm3
        packuswb    xmm7,       xmm4

        ; current row times the second vertical weight, then combine
        pmullw      xmm3,       [rax+16]
        pmullw      xmm4,       [rax+16]

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        BILINEAR16_ROUND_SHIFT

        packuswb    xmm3,       xmm4
        movdqa      [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(5) ;dst_pitch
%else
        add         rdi,        r8
%endif

        cmp         rdi,        rcx
        jne         .b16x16_both_next_row

        jmp         .b16x16_done

    ; ---- vertical pass only (xoffset == 0) ---------------------------------
    ; rax = source stride, rdx = dst_pitch, xmm1/xmm2 = vertical weights,
    ; xmm7 = previous source row (raw bytes).
.b16x16_sp_only:
        movsxd      rax,        dword ptr arg(3) ;yoffset
        shl         rax,        5
        add         rax,        rcx    ;VFilter

        movdqa      xmm1,       [rax]               ; weight for the upper row
        movdqa      xmm2,       [rax+16]            ; weight for the lower row

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(4) ;dst_ptr
        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

        lea         rcx,        [rdi+rdx*8]
        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

        pxor        xmm0,       xmm0

        movdqu      xmm7,       [rsi]               ; first source row

        add         rsi,        rax                 ; next line
.b16x16_sp_next_row:
        movdqu      xmm3,       [rsi]               ; lower row of the pair

        movdqa      xmm5,       xmm7                ; upper row, low half
        movdqa      xmm6,       xmm7                ; upper row, high half

        movdqa      xmm4,       xmm3
        movdqa      xmm7,       xmm3                ; becomes the upper row next time

        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0
        punpcklbw   xmm3,       xmm0
        punpckhbw   xmm4,       xmm0

        pmullw      xmm5,       xmm1
        pmullw      xmm6,       xmm1
        pmullw      xmm3,       xmm2
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        BILINEAR16_ROUND_SHIFT

        packuswb    xmm3,       xmm4
        movdqa      [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        rax                 ; next line
        add         rdi,        rdx                 ;dst_pitch
        cmp         rdi,        rcx
        jne         .b16x16_sp_next_row

        jmp         .b16x16_done

    ; ---- horizontal pass only (yoffset == 0) -------------------------------
    ; Entered with rsi = src_ptr, rdi = dst_ptr, rdx = dst_pitch and
    ; xmm1/xmm2 = horizontal weights already loaded.
.b16x16_fp_only:
        lea         rcx,        [rdi+rdx*8]
        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
        pxor        xmm0,       xmm0

.b16x16_fp_next_row:
        BILINEAR16_HFILTER_ROW

        packuswb    xmm3,       xmm4
        movdqa      [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        rax                 ; next line
        add         rdi,        rdx                 ; dst_pitch
        cmp         rdi,        rcx
        jne         .b16x16_fp_next_row

.b16x16_done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1223
1224
;void vp8_bilinear_predict8x8_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 8x8 two-tap (bilinear) prediction, horizontal pass followed by vertical pass.
;
; Nine source rows are first copied, 16 bytes each, into a 16-byte-aligned
; 144-byte scratch buffer on the stack so the filter loop can use aligned
; loads. xoffset / yoffset each select a 32-byte entry of
; vp8_bilinear_filters_x86_8 (two 8-lane weight vectors). Every weighted sum is
; rounded with `rd` and shifted right by VP8_FILTER_SHIFT; each output row is
; stored as 8 bytes.
;
; The scratch buffer is walked with rsi while rsp stays fixed, and the buffer
; is released explicitly before the saved stack pointer is popped. Stack
; teardown therefore does not depend on how many times the row loop ran.
;
; NOTE(review): each source row is fetched with a 16-byte load although only
; bytes 0..8 are consumed - confirm callers guarantee the extra bytes are
; readable.
global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; scratch: 9 rows * 16 bytes

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
        movdqu      xmm0,       [rsi]               ; row 0
        lea         rax,        [rdx + rdx*2]       ; rax = 3 * source stride
        movdqu      xmm1,       [rsi+rdx]           ; row 1
        movdqu      xmm2,       [rsi+rdx*2]         ; row 2
        add         rsi,        rax
        movdqu      xmm3,       [rsi]               ; row 3
        movdqu      xmm4,       [rsi+rdx]           ; row 4
        movdqu      xmm5,       [rsi+rdx*2]         ; row 5
        add         rsi,        rax
        movdqu      xmm6,       [rsi]               ; row 6
        movdqu      xmm7,       [rsi+rdx]           ; row 7

        movdqa      XMMWORD PTR [rsp],            xmm0

        movdqu      xmm0,       [rsi+rdx*2]         ; row 8 (xmm0 is free again)

        movdqa      XMMWORD PTR [rsp+16],         xmm1
        movdqa      XMMWORD PTR [rsp+32],         xmm2
        movdqa      XMMWORD PTR [rsp+48],         xmm3
        movdqa      XMMWORD PTR [rsp+64],         xmm4
        movdqa      XMMWORD PTR [rsp+80],         xmm5
        movdqa      XMMWORD PTR [rsp+96],         xmm6
        movdqa      XMMWORD PTR [rsp+112],        xmm7
        movdqa      XMMWORD PTR [rsp+128],        xmm0

        movsxd      rax,        dword ptr arg(2) ;xoffset
        shl         rax,        5                   ; 32 bytes per filter entry
        add         rax,        rcx    ;HFilter

        mov         rdi,        arg(4) ;dst_ptr
        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

        movdqa      xmm1,       [rax]               ; horizontal weight, pixel x
        movdqa      xmm2,       [rax+16]            ; horizontal weight, pixel x+1

        movsxd      rax,        dword ptr arg(3) ;yoffset
        shl         rax,        5
        add         rax,        rcx    ;VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end)

        movdqa      xmm5,       [rax]               ; vertical weight, upper row
        movdqa      xmm6,       [rax+16]            ; vertical weight, lower row

        pxor        xmm0,       xmm0

        ; get the first horizontal line done
        movdqa      xmm3,       XMMWORD PTR [rsp]
        movdqa      xmm4,       xmm3                 ; make a copy of current line
        psrldq      xmm4,       1                    ; shift by one byte -> pixels x+1

        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm4

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        movdqa      xmm7,       xmm3                 ; xmm7 = previous filtered row
        lea         rsi,        [rsp + 16]           ; rsi = cursor over buffered rows 1..8
.next_row8x8:
        movdqa      xmm3,       XMMWORD PTR [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        movdqa      xmm4,       xmm3                 ; make a copy of current line
        psrldq      xmm4,       1

        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm4
        pmullw      xmm7,       xmm5                 ; previous row * upper vertical weight

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        movdqa      xmm4,       xmm3                 ; remember this row for the next iteration

        pmullw      xmm3,       xmm6                 ; current row * lower vertical weight
        paddw       xmm3,       xmm7

        movdqa      xmm7,       xmm4

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        packuswb    xmm3,       xmm0
        movq        [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        16                 ; next buffered line
        add         rdi,        rdx

        cmp         rdi,        rcx
        jne         .next_row8x8

    add         rsp, 144                         ; release the scratch buffer
    pop rsp                                      ; undo ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1367
1368
SECTION_RODATA
align 16
; Rounding term added before every ">> VP8_FILTER_SHIFT": half of the filter
; scale, replicated across eight 16-bit lanes.
rd:
    times 8 dw (1 << (VP8_FILTER_SHIFT - 1))
1373