1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14%define BLOCK_HEIGHT_WIDTH 4
15%define VP8_FILTER_WEIGHT 128
16%define VP8_FILTER_SHIFT  7
17
18
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This function handles 8 pixels in the
; horizontal direction and produces ONE output row per loop iteration, using 128-bit
; operations. The row counter is decremented before it is tested, so output_height
; must be at least 1.
;
; If the first four bytes of the selected k0_k5 table entry are zero, only the
; k1_k3 and k2_k4 coefficient pairs are applied (the 4 tap path further down).
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Register roles inside both row loops:
;   rsi = current source row          rax = src_pixels_per_line
;   rdi = current output row          rdx = output_pitch
;   rcx = rows remaining              xmm7 = rounding term loaded from rd
; Each row reads the 8 bytes at [rsi - 2] and the 8 bytes at [rsi + 3], and
; writes 8 bytes at [rdi].
global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the outer-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]      ;rounding term, added before the shift

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]    ;first four bytes of the entry all zero?
    je          .vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-bias: the loop advances rdi before storing
;xmm3 free
.filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4                    ; k0_k5 pair sums

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]   ; regroup bytes for the k2_k4 multiply

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]   ; regroup bytes for the k1_k3 multiply
    pmaddubsw   xmm1,   xmm5

    lea         rdi,    [rdi + rdx]             ; next output row (lea leaves flags alone)
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]             ; next source row
    dec         rcx                             ; ZF is consumed by the jnz below; nothing
                                                ; in between modifies the flags

    paddsw      xmm0,   xmm1
    paddsw      xmm2,   xmm7

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0                    ; saturate to unsigned bytes

    movq        MMWORD Ptr [rdi], xmm0
    jnz         .filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4 tap path: the k0_k5 multiply is skipped. Entered with rax -> k0_k5 entry,
; rdi = output_ptr and xmm7 = rounding term. Kept as a local label so that it
; does not become a file-scope symbol.
.vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-bias: the loop advances rdi before storing

.filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2,   xmm0
    pshufb      xmm0,   xmm3                    ; regroup bytes for the k2_k4 multiply

    pshufb      xmm2,   xmm4                    ; regroup bytes for the k1_k3 multiply
    pmaddubsw   xmm0,   xmm5

    lea         rdi,    [rdi + rdx]             ; next output row
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]             ; next source row
    dec         rcx                             ; ZF is consumed by the jnz below

    paddsw      xmm0,   xmm7

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         .filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal filter producing 16 output bytes per row as two 8-pixel halves.
; All three coefficient pairs are always applied; unlike the 8- and 4-wide
; horizontal routines in this file there is no 4 tap shortcut here.
;
; Each row reads source bytes [rsi - 2] .. [rsi + 18] and writes 16 bytes at
; [rdi] with movdqa, so every output row address must be 16-byte aligned.
; The row counter is tested after the first row, so output_height must be >= 1.
;
; Register roles inside the loop:
;   rsi = current source row          rax = src_pixels_per_line
;   rdi = current output row          rdx = output_pitch
;   rcx = rows remaining              xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3
global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax -> k0_k5 entry for this filter

    mov         rdi, arg(2)                     ;output_ptr

    mov         rsi, arg(0)                     ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(3)           ;output_pitch

.filter_block1d16_h6_rowloop_ssse3:
    ; ---- left half: output pixels 0..7 ----
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4                    ; k0_k5 pair sums

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]   ; regroup bytes for the k2_k4 multiply

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]   ; regroup bytes for the k1_k3 multiply
    ; ---- right half loads (pixels 8..15) are interleaved from here on ----
    movq        xmm3,   MMWORD PTR [rsi +  6]

    pmaddubsw   xmm1,   xmm5
    movq        xmm7,   MMWORD PTR [rsi + 11]

    pmaddubsw   xmm2,   xmm6
    punpcklbw   xmm3,   xmm7

    paddsw      xmm0,   xmm1
    movdqa      xmm1,   xmm3

    pmaddubsw   xmm3,   xmm4                    ; right half, k0_k5
    paddsw      xmm0,   xmm2

    movdqa      xmm2,   xmm1
    paddsw      xmm0,   [GLOBAL(rd)]            ; left half rounding

    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

    psraw       xmm0,   VP8_FILTER_SHIFT
    pmaddubsw   xmm1,   xmm5                    ; right half, k2_k4

    pmaddubsw   xmm2,   xmm6                    ; right half, k1_k3
    packuswb    xmm0,   xmm0                    ; left half -> 8 unsigned bytes

    lea         rsi,    [rsi + rax]             ; next source row
    paddsw      xmm3,   xmm1

    paddsw      xmm3,   xmm2

    paddsw      xmm3,   [GLOBAL(rd)]            ; right half rounding

    psraw       xmm3,   VP8_FILTER_SHIFT

    packuswb    xmm3,   xmm3                    ; right half -> 8 unsigned bytes

    punpcklqdq  xmm0,   xmm3                    ; low qword = left, high qword = right

    movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store

    lea         rdi,    [rdi + rdx]             ; next output row
    dec         rcx
    jnz         .filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
277
;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal filter producing 4 output bytes per row. If the first four bytes
; of the selected k0_k5 entry are zero, the 4 tap path below skips the k0_k5
; multiply.
;
; Each row is fetched with a single unaligned 16-byte load from [rsi - 2], so
; the 16 bytes starting two bytes before every source row must be readable.
; Four bytes are written at [rdi]. output_height must be >= 1.
;
; Register roles inside both row loops:
;   rsi = current source row          rax = src_pixels_per_line
;   rdi = current output row          rdx = output_pitch
;   rcx = rows remaining              xmm7 = rounding term loaded from rd
global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the outer-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter
    movdqa      xmm7, [GLOBAL(rd)]      ;rounding term, added before the shift

    cmp         esi, DWORD PTR [rax]    ;first four bytes of the entry all zero?
    je          .vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf1b)] ;loop-invariant mask kept in a register

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]   ; unaligned 16-byte fetch

    movdqa      xmm1, xmm0
    pshufb      xmm0, xmm3                      ; byte pairs for k0_k5

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]          ; byte pairs for k2_k4
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [GLOBAL(shuf3b)]          ; byte pairs for k1_k3
    pmaddubsw   xmm1, xmm5

;--
    pmaddubsw   xmm2, xmm6

    lea         rsi,    [rsi + rax]             ; next source row
;--
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0                      ; saturate to unsigned bytes

    movd        DWORD PTR [rdi], xmm0           ; store 4 pixels

    add         rdi, rdx                        ; next output row
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4 tap path: entered with rax -> k0_k5 entry and xmm7 = rounding term.
.vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ; unaligned 16-byte fetch

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5

;--
    pmaddubsw   xmm2, xmm6

    lea         rsi,    [rsi + rax]             ; next source row
;--
    paddsw      xmm1, xmm7
    paddsw      xmm1, xmm2
    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1           ; store 4 pixels

    add         rdi, rdx                        ; next output row
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
405
406
407
;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical filter producing 16 output bytes per row as two 8-pixel halves.
; Source rows are named A..F: A = [rsi], B = [rsi + pitch], ... F = [rsi + 5*pitch]
; (rax is kept at rsi + pitch so that D and F can be addressed as
; [rax + 2*pitch] and [rax + 4*pitch]).
;
; If the first four bytes of the selected k0_k5 entry are zero, the 4 tap path
; below skips rows A and F. Both paths store each half with an 8-byte movq, so
; the output pointer has no 16-byte alignment requirement on either path.
; output_height must be >= 1.
;
; Register roles inside both row loops:
;   rsi = source row A                rdx = src_pitch
;   rax = rsi + src_pitch             rcx = rows remaining
;   rdi = current output row          r8  = out_pitch (64-bit builds only;
;                                           32-bit builds re-read arg(3))
global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the outer-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter

    cmp         esi, DWORD PTR [rax]    ;first four bytes of the entry all zero?
    je          .vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = row B


.vp8_filter_block1d16_v6_ssse3_loop:
    ; ---- left 8 columns ----
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

    pmaddubsw   xmm3, xmm6                  ;C E * k2_k4
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7                  ;B D * k1_k3
    pmaddubsw   xmm1, xmm5                  ;A F * k0_k5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ;rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ;store the results

    ; ---- right 8 columns ----
    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi,  rdx                   ;slide the row window down by one
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ;rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4 tap path: entered with rax -> k0_k5 entry. Rows A and F are not read.
.vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = row B

.vp8_filter_block1d16_v4_ssse3_loop:
    ; ---- left 8 columns ----
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6                  ;C E * k2_k4
    pmaddubsw   xmm2, xmm7                  ;B D * k1_k3
    ; ---- right 8 columns (loads issued early) ----
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    paddsw      xmm2, [GLOBAL(rd)]          ;rounding
    paddsw      xmm2, xmm3
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2                  ;left half -> 8 unsigned bytes

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi,  rdx                   ;slide the row window down by one
    add         rax,  rdx
;--
;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4                  ;rounding
    psraw       xmm5, VP8_FILTER_SHIFT
    packuswb    xmm5, xmm5                  ;right half -> 8 unsigned bytes

    ; Store the halves separately, as the 6 tap loop does, so that this path
    ; does not demand a 16-byte aligned output row.
    movq        MMWORD PTR [rdi], xmm2
    movq        MMWORD PTR [rdi+8], xmm5

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
595
;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical filter producing 8 output bytes per row. Source rows are named
; A..F, with A = [rsi] and each following row one src_pitch further on; rax is
; kept at rsi + src_pitch so that D and F are [rax + 2*pitch] / [rax + 4*pitch].
; When the first four bytes of the selected k0_k5 entry are zero, rows A and F
; are skipped (4 tap loop). output_height must be >= 1.
;
; Register roles inside both row loops:
;   rsi = source row A                rdx = src_pitch
;   rax = rsi + src_pitch             rcx = rows remaining
;   rdi = current output row          r8  = out_pitch (64-bit builds only)
;   xmm6 = k2_k4, xmm7 = k1_k3, xmm5 = k0_k5 (6 tap) or rounding term (4 tap)
global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rax -> coefficient entry chosen by vp8_filter_index (16 bytes each)
    movsxd      rdx, DWORD PTR arg(5)
    lea         rax, [GLOBAL(k0_k5)]
    shl         rdx, 4
    add         rax, rdx

    ; arguments shared by both paths
    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif

    ; inner coefficient pairs are needed whichever path is taken
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3

    cmp         DWORD PTR [rax], 0              ; outer taps absent?
    je          .vp8_filter_block1d8_v4_setup

    movdqa      xmm5, XMMWORD PTR [rax]         ; k0_k5
    lea         rax, [rsi + rdx]                ; rax = row B

.vp8_filter_block1d8_v6_rows:
    ; outer pair: A with F
    movq        xmm1, MMWORD PTR [rsi]
    movq        xmm0, MMWORD PTR [rax + rdx * 4]
    punpcklbw   xmm1, xmm0
    pmaddubsw   xmm1, xmm5

    ; B with D
    movq        xmm2, MMWORD PTR [rsi + rdx]
    movq        xmm4, MMWORD PTR [rax + rdx * 2]
    punpcklbw   xmm2, xmm4
    pmaddubsw   xmm2, xmm7

    ; C with E
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]
    punpcklbw   xmm3, xmm0
    pmaddubsw   xmm3, xmm6

    ; accumulate in the order BD + CE, + AF, + rounding
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

    ; slide the source window down one row and step the output row
    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v6_rows

    jmp         .vp8_filter_block1d8_v6_exit

.vp8_filter_block1d8_v4_setup:
    movdqa      xmm5, [GLOBAL(rd)]              ; rounding term stays in a register
    lea         rax, [rsi + rdx]                ; rax = row B

.vp8_filter_block1d8_v4_rows:
    ; B with D
    movq        xmm2, MMWORD PTR [rsi + rdx]
    movq        xmm4, MMWORD PTR [rax + rdx * 2]
    punpcklbw   xmm2, xmm4
    pmaddubsw   xmm2, xmm7

    ; C with E
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]
    punpcklbw   xmm3, xmm0
    pmaddubsw   xmm3, xmm6

    ; accumulate in the order BD + CE, + rounding
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v4_rows

.vp8_filter_block1d8_v6_exit:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical filter producing 4 output bytes per row, using the 64-bit MMX
; registers mm0-mm7 only (hence no SAVE_XMM / RESTORE_XMM here). Source rows
; are named A..F, with A = [rsi] and each following row one src_pitch further
; on; rax is kept at rsi + src_pitch. When the first four bytes of the selected
; k0_k5 entry are zero, rows A and F are skipped. output_height must be >= 1.
;
; NOTE(review): this routine does not execute emms before returning;
; presumably callers clear the MMX state before any x87 code runs - confirm.
;
; Register roles inside both row loops:
;   rsi = source row A                rdx = src_pitch
;   rax = rsi + src_pitch             rcx = rows remaining
;   rdi = current output row          r8  = out_pitch (64-bit builds only)
;   mm6 = k2_k4, mm7 = k1_k3, mm5 = k0_k5 (6 tap) or rounding term (4 tap)
global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rax -> coefficient entry chosen by vp8_filter_index (16 bytes each)
    movsxd      rdx, DWORD PTR arg(5)
    lea         rax, [GLOBAL(k0_k5)]
    shl         rdx, 4
    add         rax, rdx

    ; arguments shared by both paths
    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif

    ; inner coefficient pairs (low 8 bytes of each entry) serve both paths
    movq        mm6, MMWORD PTR [rax+256]       ; k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ; k1_k3

    cmp         DWORD PTR [rax], 0              ; outer taps absent?
    je          .vp8_filter_block1d4_v4_setup

    movq        mm5, MMWORD PTR [rax]           ; k0_k5
    lea         rax, [rsi + rdx]                ; rax = row B

.vp8_filter_block1d4_v6_rows:
    ; outer pair: A with F
    movd        mm1, DWORD PTR [rsi]
    movd        mm0, DWORD PTR [rax + rdx * 4]
    punpcklbw   mm1, mm0
    pmaddubsw   mm1, mm5

    ; B with D
    movd        mm2, DWORD PTR [rsi + rdx]
    movd        mm4, DWORD PTR [rax + rdx * 2]
    punpcklbw   mm2, mm4
    pmaddubsw   mm2, mm7

    ; C with E
    movd        mm3, DWORD PTR [rsi + rdx * 2]
    movd        mm0, DWORD PTR [rsi + rdx * 4]
    punpcklbw   mm3, mm0
    pmaddubsw   mm3, mm6

    ; accumulate in the order BD + CE, + AF, + rounding
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, [GLOBAL(rd)]
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

    ; slide the source window down one row and step the output row
    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v6_rows

    jmp         .vp8_filter_block1d4_v6_exit

.vp8_filter_block1d4_v4_setup:
    movq        mm5, MMWORD PTR [GLOBAL(rd)]    ; rounding term stays in a register
    lea         rax, [rsi + rdx]                ; rax = row B

.vp8_filter_block1d4_v4_rows:
    ; B with D
    movd        mm2, DWORD PTR [rsi + rdx]
    movd        mm4, DWORD PTR [rax + rdx * 2]
    punpcklbw   mm2, mm4
    pmaddubsw   mm2, mm7

    ; C with E
    movd        mm3, DWORD PTR [rsi + rdx * 2]
    movd        mm0, DWORD PTR [rsi + rdx * 4]
    punpcklbw   mm3, mm0
    pmaddubsw   mm3, mm6

    ; accumulate in the order BD + CE, + rounding
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v4_rows

.vp8_filter_block1d4_v6_exit:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
874
875;void vp8_bilinear_predict16x16_ssse3
876;(
877;    unsigned char  *src_ptr,
878;    int   src_pixels_per_line,
879;    int  xoffset,
880;    int  yoffset,
881;    unsigned char *dst_ptr,
882;    int dst_pitch
883;)
884global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
885sym(vp8_bilinear_predict16x16_ssse3):
886    push        rbp
887    mov         rbp, rsp
888    SHADOW_ARGS_TO_STACK 6
889    SAVE_XMM 7
890    GET_GOT     rbx
891    push        rsi
892    push        rdi
893    ; end prolog
894
895        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
896        movsxd      rax,        dword ptr arg(2)    ; xoffset
897
898        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
899        je          .b16x16_sp_only
900
901        shl         rax,        4
902        lea         rax,        [rax + rcx]         ; HFilter
903
904        mov         rdi,        arg(4)              ; dst_ptr
905        mov         rsi,        arg(0)              ; src_ptr
906        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
907
908        movdqa      xmm1,       [rax]
909
910        movsxd      rax,        dword ptr arg(3)    ; yoffset
911
912        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
913        je          .b16x16_fp_only
914
915        shl         rax,        4
916        lea         rax,        [rax + rcx]         ; VFilter
917
918        lea         rcx,        [rdi+rdx*8]
919        lea         rcx,        [rcx+rdx*8]
920        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
921
922        movdqa      xmm2,       [rax]
923
924%if ABI_IS_32BIT=0
925        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
926%endif
927        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
928        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
929
930        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
931        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
932
933        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
934
935        lea         rsi,        [rsi + rdx]         ; next line
936
937        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
938
939        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
940        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
941
942        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
943        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
944
945        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
946        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
947
948        movdqa      xmm7,       xmm3
949        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
950
951.next_row:
952        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
953        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
954
955        punpcklbw   xmm6,       xmm5
956        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
957
958        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
959        lea         rsi,        [rsi + rdx]         ; next line
960
961        pmaddubsw   xmm6,       xmm1
962
963        punpcklbw   xmm4,       xmm5
964        pmaddubsw   xmm4,       xmm1
965
966        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
967        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
968
969        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
970        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
971
972        packuswb    xmm6,       xmm4
973        movdqa      xmm5,       xmm7
974
975        punpcklbw   xmm5,       xmm6
976        pmaddubsw   xmm5,       xmm2
977
978        punpckhbw   xmm7,       xmm6
979        pmaddubsw   xmm7,       xmm2
980
981        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
982        psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
983
984        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
985        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
986
987        packuswb    xmm5,       xmm7
988        movdqa      xmm7,       xmm6
989
990        movdqa      [rdi],      xmm5                ; store the results in the destination
991%if ABI_IS_32BIT
992        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
993%else
994        add         rdi,        r8
995%endif
996
997        cmp         rdi,        rcx
998        jne         .next_row
999
1000        jmp         .done
1001
1002.b16x16_sp_only:
1003        movsxd      rax,        dword ptr arg(3)    ; yoffset
1004        shl         rax,        4
1005        lea         rax,        [rax + rcx]         ; VFilter
1006
1007        mov         rdi,        arg(4)              ; dst_ptr
1008        mov         rsi,        arg(0)              ; src_ptr
1009        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
1010
1011        movdqa      xmm1,       [rax]               ; VFilter
1012
1013        lea         rcx,        [rdi+rdx*8]
1014        lea         rcx,        [rcx+rdx*8]
1015        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1016
1017        ; get the first horizontal line done
1018        movq        xmm4,       [rsi]               ; load row 0
1019        movq        xmm2,       [rsi + 8]           ; load row 0
1020
1021        lea         rsi,        [rsi + rax]         ; next line
1022.next_row_sp:
1023        movq        xmm3,       [rsi]               ; load row + 1
1024        movq        xmm5,       [rsi + 8]           ; load row + 1
1025
1026        punpcklbw   xmm4,       xmm3
1027        punpcklbw   xmm2,       xmm5
1028
1029        pmaddubsw   xmm4,       xmm1
1030        movq        xmm7,       [rsi + rax]         ; load row + 2
1031
1032        pmaddubsw   xmm2,       xmm1
1033        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
1034
1035        punpcklbw   xmm3,       xmm7
1036        punpcklbw   xmm5,       xmm6
1037
1038        pmaddubsw   xmm3,       xmm1
1039        paddw       xmm4,       [GLOBAL(rd)]
1040
1041        pmaddubsw   xmm5,       xmm1
1042        paddw       xmm2,       [GLOBAL(rd)]
1043
1044        psraw       xmm4,       VP8_FILTER_SHIFT
1045        psraw       xmm2,       VP8_FILTER_SHIFT
1046
1047        packuswb    xmm4,       xmm2
1048        paddw       xmm3,       [GLOBAL(rd)]
1049
1050        movdqa      [rdi],      xmm4                ; store row 0
1051        paddw       xmm5,       [GLOBAL(rd)]
1052
1053        psraw       xmm3,       VP8_FILTER_SHIFT
1054        psraw       xmm5,       VP8_FILTER_SHIFT
1055
1056        packuswb    xmm3,       xmm5
1057        movdqa      xmm4,       xmm7
1058
1059        movdqa      [rdi + rdx],xmm3                ; store row 1
1060        lea         rsi,        [rsi + 2*rax]
1061
1062        movdqa      xmm2,       xmm6
1063        lea         rdi,        [rdi + 2*rdx]
1064
1065        cmp         rdi,        rcx
1066        jne         .next_row_sp
1067
1068        jmp         .done
1069
1070.b16x16_fp_only:
1071        lea         rcx,        [rdi+rdx*8]
1072        lea         rcx,        [rcx+rdx*8]
1073        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1074
1075.next_row_fp:
1076        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
1077        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
1078
1079        punpcklbw   xmm2,       xmm4
1080        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
1081
1082        pmaddubsw   xmm2,       xmm1
1083        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
1084
1085        lea         rsi,        [rsi + rax]         ; next line
1086        punpcklbw   xmm3,       xmm4
1087
1088        pmaddubsw   xmm3,       xmm1
1089        movq        xmm5,       [rsi]
1090
1091        paddw       xmm2,       [GLOBAL(rd)]
1092        movq        xmm7,       [rsi+1]
1093
1094        movq        xmm6,       [rsi+8]
1095        psraw       xmm2,       VP8_FILTER_SHIFT
1096
1097        punpcklbw   xmm5,       xmm7
1098        movq        xmm7,       [rsi+9]
1099
1100        paddw       xmm3,       [GLOBAL(rd)]
1101        pmaddubsw   xmm5,       xmm1
1102
1103        psraw       xmm3,       VP8_FILTER_SHIFT
1104        punpcklbw   xmm6,       xmm7
1105
1106        packuswb    xmm2,       xmm3
1107        pmaddubsw   xmm6,       xmm1
1108
1109        movdqa      [rdi],      xmm2                ; store the results in the destination
1110        paddw       xmm5,       [GLOBAL(rd)]
1111
1112        lea         rdi,        [rdi + rdx]         ; dst_pitch
1113        psraw       xmm5,       VP8_FILTER_SHIFT
1114
1115        paddw       xmm6,       [GLOBAL(rd)]
1116        psraw       xmm6,       VP8_FILTER_SHIFT
1117
1118        packuswb    xmm5,       xmm6
1119        lea         rsi,        [rsi + rax]         ; next line
1120
1121        movdqa      [rdi],      xmm5                ; store the results in the destination
1122        lea         rdi,        [rdi + rdx]         ; dst_pitch
1123
1124        cmp         rdi,        rcx
1125
1126        jne         .next_row_fp
1127
1128.done:
1129    ; begin epilog
1130    pop         rdi
1131    pop         rsi
1132    RESTORE_GOT
1133    RESTORE_XMM
1134    UNSHADOW_ARGS
1135    pop         rbp
1136    ret
1137
;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Bilinear-predicts one 8x8 block: a horizontal 2-tap pass selected by
; xoffset, followed by a vertical 2-tap pass selected by yoffset. Each pass
; computes (a*tap0 + b*tap1 + 64) >> VP8_FILTER_SHIFT and saturates the
; result to an unsigned byte.
;
; xoffset / yoffset index the 16-byte rows of vp8_bilinear_filters_ssse3
; (8 rows, so 0..7 is expected). They are NOT range-checked here.
;
; Memory access:
;   * 16 bytes are loaded (unaligned) from each of 9 consecutive source rows,
;     although only the first 9 bytes of a row feed the filters.
;   * 8 bytes are stored to each of 8 destination rows, dst_pitch apart.
;
; Stack usage: the 9 source rows are spilled to a 144-byte scratch area that
; starts at rsp (stored with movdqa, so it must be 16-byte aligned). rsp
; itself is used as the row cursor; every path advances it by exactly 144
; bytes before reaching .done8x8, where "pop rsp" reloads the stack pointer
; (presumably the value saved by ALIGN_STACK - that macro is defined in
; x86_abi_support.asm; confirm there).
;
; Dispatch. pmaddubsw treats the tap bytes as signed, so the 128 in filter
; row 0 would act as -128; offset 0 is therefore never run through a filter:
;   xoffset != 0, yoffset != 0 : both passes            (.next_row)
;   xoffset != 0, yoffset == 0 : horizontal pass only   (.b8x8_fp_only)
;   xoffset == 0, yoffset != 0 : vertical pass only     (.b8x8_sp_only)
;   xoffset == 0, yoffset == 0 : plain 8x8 copy         (.b8x8_copy)
global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; 9 rows * 16 bytes of scratch

        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    ;rax = 3 * stride, so rsi steps three rows at a time.
        movdqu      xmm0,       [rsi]               ; row 0
        lea         rax,        [rdx + rdx*2]
        movdqu      xmm1,       [rsi+rdx]           ; row 1
        movdqu      xmm2,       [rsi+rdx*2]         ; row 2
        add         rsi,        rax
        movdqu      xmm3,       [rsi]               ; row 3
        movdqu      xmm4,       [rsi+rdx]           ; row 4
        movdqu      xmm5,       [rsi+rdx*2]         ; row 5
        add         rsi,        rax
        movdqu      xmm6,       [rsi]               ; row 6
        movdqu      xmm7,       [rsi+rdx]           ; row 7

        movdqa      XMMWORD PTR [rsp],            xmm0

        movdqu      xmm0,       [rsi+rdx*2]         ; row 8 (xmm0 is free again)

        movdqa      XMMWORD PTR [rsp+16],         xmm1
        movdqa      XMMWORD PTR [rsp+32],         xmm2
        movdqa      XMMWORD PTR [rsp+48],         xmm3
        movdqa      XMMWORD PTR [rsp+64],         xmm4
        movdqa      XMMWORD PTR [rsp+80],         xmm5
        movdqa      XMMWORD PTR [rsp+96],         xmm6
        movdqa      XMMWORD PTR [rsp+112],        xmm7
        movdqa      XMMWORD PTR [rsp+128],        xmm0

        movsxd      rax,        dword ptr arg(2)    ; xoffset
        test        rax,        rax                 ; skip first_pass filter if xoffset=0
        jz          .b8x8_sp_only

        shl         rax,        4                   ; 16 bytes per filter row
        add         rax,        rcx                 ; HFilter

        mov         rdi,        arg(4)              ; dst_ptr
        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

        movdqa      xmm0,       [rax]               ; xmm0 = HFilter tap pairs

        movsxd      rax,        dword ptr arg(3)    ; yoffset
        test        rax,        rax                 ; skip second_pass filter if yoffset=0
        jz          .b8x8_fp_only

        shl         rax,        4
        lea         rax,        [rax + rcx]         ; VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end)

        movdqa      xmm1,       [rax]               ; xmm1 = VFilter tap pairs

        ; get the first horizontal line done
        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        movdqa      xmm5,       xmm3

        psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
        lea         rsp,        [rsp + 16]          ; next line

        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
        pmaddubsw   xmm3,       xmm0                ; 8 words: p[i]*tap0 + p[i+1]*tap1

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

        movdqa      xmm7,       xmm3
        packuswb    xmm7,       xmm7                ; 8 filtered bytes (duplicated in both halves)

        ; Loop invariant: xmm7 = horizontally filtered previous row (bytes),
        ; [rsp] = next unprocessed source row. Runs 8 times (rows 1..8).
.next_row:
        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        lea         rsp,        [rsp + 16]          ; next line

        movdqa      xmm5,       xmm6

        psrldq      xmm5,       1

        punpcklbw   xmm6,       xmm5
        pmaddubsw   xmm6,       xmm0

        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

        packuswb    xmm6,       xmm6                ; current row, horizontally filtered

        punpcklbw   xmm7,       xmm6                ; interleave previous/current row bytes
        pmaddubsw   xmm7,       xmm1                ; vertical pass

        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

        packuswb    xmm7,       xmm7

        movq        [rdi],      xmm7                ; store the results in the destination
        lea         rdi,        [rdi + rdx]

        movdqa      xmm7,       xmm6                ; current row becomes previous row

        cmp         rdi,        rcx
        jne         .next_row

        jmp         .done8x8

        ; xoffset == 0: vertical pass only. rcx still holds the filter table
        ; base here (it is only overwritten on the two-pass path).
.b8x8_sp_only:
        mov         rdi,        arg(4) ;dst_ptr
        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

        movsxd      rax,        dword ptr arg(3)    ; yoffset
        test        rax,        rax                 ; both offsets 0: tap 128 would be
        jz          .b8x8_copy                      ; read as -128, so copy instead

        shl         rax,        4
        lea         rax,        [rax + rcx]         ; VFilter

        movdqa      xmm0,       [rax]               ; VFilter

        ; Rows 0..6: adjacent rows are byte-interleaved and filtered, giving
        ; output rows 0..5.
        movq        xmm1,       XMMWORD PTR [rsp]
        movq        xmm2,       XMMWORD PTR [rsp+16]

        movq        xmm3,       XMMWORD PTR [rsp+32]
        punpcklbw   xmm1,       xmm2

        movq        xmm4,       XMMWORD PTR [rsp+48]
        punpcklbw   xmm2,       xmm3

        movq        xmm5,       XMMWORD PTR [rsp+64]
        punpcklbw   xmm3,       xmm4

        movq        xmm6,       XMMWORD PTR [rsp+80]
        punpcklbw   xmm4,       xmm5

        movq        xmm7,       XMMWORD PTR [rsp+96]
        punpcklbw   xmm5,       xmm6

        pmaddubsw   xmm1,       xmm0
        pmaddubsw   xmm2,       xmm0

        pmaddubsw   xmm3,       xmm0
        pmaddubsw   xmm4,       xmm0

        pmaddubsw   xmm5,       xmm0
        punpcklbw   xmm6,       xmm7

        pmaddubsw   xmm6,       xmm0
        paddw       xmm1,       [GLOBAL(rd)]

        paddw       xmm2,       [GLOBAL(rd)]
        psraw       xmm1,       VP8_FILTER_SHIFT

        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm2,       VP8_FILTER_SHIFT

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT

        paddw       xmm5,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        paddw       xmm6,       [GLOBAL(rd)]
        psraw       xmm5,       VP8_FILTER_SHIFT

        psraw       xmm6,       VP8_FILTER_SHIFT
        packuswb    xmm1,       xmm1

        packuswb    xmm2,       xmm2
        movq        [rdi],      xmm1                ; output row 0

        packuswb    xmm3,       xmm3
        movq        [rdi+rdx],  xmm2                ; output row 1

        packuswb    xmm4,       xmm4
        movq        xmm1,       XMMWORD PTR [rsp+112]   ; source row 7 (xmm1 already stored)

        lea         rdi,        [rdi + 2*rdx]
        movq        xmm2,       XMMWORD PTR [rsp+128]   ; source row 8 (xmm2 already stored)

        packuswb    xmm5,       xmm5
        movq        [rdi],      xmm3                ; output row 2

        packuswb    xmm6,       xmm6
        movq        [rdi+rdx],  xmm4                ; output row 3

        lea         rdi,        [rdi + 2*rdx]
        punpcklbw   xmm7,       xmm1                ; rows 6/7

        movq        [rdi],      xmm5                ; output row 4
        pmaddubsw   xmm7,       xmm0

        movq        [rdi+rdx],  xmm6                ; output row 5
        punpcklbw   xmm1,       xmm2                ; rows 7/8

        pmaddubsw   xmm1,       xmm0
        paddw       xmm7,       [GLOBAL(rd)]

        psraw       xmm7,       VP8_FILTER_SHIFT
        paddw       xmm1,       [GLOBAL(rd)]

        psraw       xmm1,       VP8_FILTER_SHIFT
        packuswb    xmm7,       xmm7

        packuswb    xmm1,       xmm1
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm7                ; output row 6

        movq        [rdi+rdx],  xmm1                ; output row 7
        lea         rsp,        [rsp + 144]         ; release the whole scratch area

        jmp         .done8x8

        ; xoffset == 0 and yoffset == 0: no filtering, copy the first 8 bytes
        ; of source rows 0..7 (already on the stack) to the destination.
        ; Entered from .b8x8_sp_only with rdi = dst_ptr, rdx = dst_pitch.
.b8x8_copy:
        movq        xmm0,       XMMWORD PTR [rsp]
        movq        xmm1,       XMMWORD PTR [rsp+16]
        movq        xmm2,       XMMWORD PTR [rsp+32]
        movq        xmm3,       XMMWORD PTR [rsp+48]
        movq        xmm4,       XMMWORD PTR [rsp+64]
        movq        xmm5,       XMMWORD PTR [rsp+80]
        movq        xmm6,       XMMWORD PTR [rsp+96]
        movq        xmm7,       XMMWORD PTR [rsp+112]

        movq        [rdi],      xmm0
        movq        [rdi+rdx],  xmm1
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm2
        movq        [rdi+rdx],  xmm3
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm4
        movq        [rdi+rdx],  xmm5
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm6
        movq        [rdi+rdx],  xmm7

        lea         rsp,        [rsp + 144]         ; release the whole scratch area

        jmp         .done8x8

        ; yoffset == 0: horizontal pass only, xmm0 = HFilter.
        ; Four rows per iteration, two iterations.
.b8x8_fp_only:
        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end)

.next_row_fp:
        movdqa      xmm1,       XMMWORD PTR [rsp]
        movdqa      xmm3,       XMMWORD PTR [rsp+16]

        movdqa      xmm2,       xmm1
        movdqa      xmm5,       XMMWORD PTR [rsp+32]

        psrldq      xmm2,       1
        movdqa      xmm7,       XMMWORD PTR [rsp+48]

        movdqa      xmm4,       xmm3
        psrldq      xmm4,       1

        movdqa      xmm6,       xmm5
        psrldq      xmm6,       1

        punpcklbw   xmm1,       xmm2
        pmaddubsw   xmm1,       xmm0

        punpcklbw   xmm3,       xmm4
        pmaddubsw   xmm3,       xmm0

        punpcklbw   xmm5,       xmm6
        pmaddubsw   xmm5,       xmm0

        movdqa      xmm2,       xmm7
        psrldq      xmm2,       1

        punpcklbw   xmm7,       xmm2
        pmaddubsw   xmm7,       xmm0

        paddw       xmm1,       [GLOBAL(rd)]
        psraw       xmm1,       VP8_FILTER_SHIFT

        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT

        paddw       xmm5,       [GLOBAL(rd)]
        psraw       xmm5,       VP8_FILTER_SHIFT

        paddw       xmm7,       [GLOBAL(rd)]
        psraw       xmm7,       VP8_FILTER_SHIFT

        packuswb    xmm1,       xmm1
        packuswb    xmm3,       xmm3

        packuswb    xmm5,       xmm5
        movq        [rdi],      xmm1

        packuswb    xmm7,       xmm7
        movq        [rdi+rdx],  xmm3

        lea         rdi,        [rdi + 2*rdx]
        movq        [rdi],      xmm5

        lea         rsp,        [rsp + 4*16]        ; four scratch rows consumed
        movq        [rdi+rdx],  xmm7

        lea         rdi,        [rdi + 2*rdx]
        cmp         rdi,        rcx

        jne         .next_row_fp

        lea         rsp,        [rsp + 16]          ; skip the unused 9th row

.done8x8:
    ; rsp has been advanced past the 144-byte scratch area on every path, so
    ; no "add rsp, 144" is needed; pop the stack pointer back directly.
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1449
SECTION_RODATA

; Byte-index patterns, 16 bytes each. Presumably pshufb masks for the six-tap
; code earlier in the file (their users are not visible from this section).
; shuf2b and shuf3b carry no align directive of their own: they directly
; follow a 16-byte entry that is itself 16-byte aligned.
align 16
shuf1b:
    db  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7, 12
shuf2b:
    db  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11
shuf3b:
    db  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10

align 16
shuf2bfrom1:
    db  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11,  9, 13
align 16
shuf3bfrom1:
    db  2,  6,  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11

; Rounding term added to every 16-bit sum before the arithmetic shift right
; by VP8_FILTER_SHIFT: eight words of 1 << (7 - 1) = 0x40.
align 16
rd:
    times 8 dw (1 << (VP8_FILTER_SHIFT - 1))

; Coefficient-pair tables: 8 rows of 16 bytes each (one byte pair repeated
; 8 times). k0_k5 is addressed as base + (filter index << 4). The names
; suggest six-tap taps (0,5), (1,3) and (2,4) - confirm against the six-tap
; routines. The three tables are emitted back to back (128 bytes each); keep
; them contiguous and in this order, since code elsewhere may reach k1_k3 and
; k2_k4 by fixed displacement from k0_k5.
align 16
k0_k5:
    times 8 db   0,   0         ; row 0 (placeholder)
    times 8 db   0,   0         ; row 1
    times 8 db   2,   1         ; row 2
    times 8 db   0,   0         ; row 3
    times 8 db   3,   3         ; row 4
    times 8 db   0,   0         ; row 5
    times 8 db   1,   2         ; row 6
    times 8 db   0,   0         ; row 7
k1_k3:
    times 8 db   0,   0         ; row 0 (placeholder)
    times 8 db  -6,  12         ; row 1
    times 8 db -11,  36         ; row 2
    times 8 db  -9,  50         ; row 3
    times 8 db -16,  77         ; row 4
    times 8 db  -6,  93         ; row 5
    times 8 db  -8, 108         ; row 6
    times 8 db  -1, 123         ; row 7
k2_k4:
    times 8 db 128,   0         ; row 0 (placeholder)
    times 8 db 123,  -1         ; row 1
    times 8 db 108,  -8         ; row 2
    times 8 db  93,  -6         ; row 3
    times 8 db  77, -16         ; row 4
    times 8 db  50,  -9         ; row 5
    times 8 db  36, -11         ; row 6
    times 8 db  12,  -6         ; row 7

; Bilinear tap pairs, addressed as base + (offset << 4). Row i holds the pair
; (128 - 16*i, 16*i) repeated 8 times, i.e.
;   (128,0) (112,16) (96,32) (80,48) (64,64) (48,80) (32,96) (16,112)
align 16
vp8_bilinear_filters_ssse3:
%assign bil_tap1 0
%rep 8
    times 8 db (VP8_FILTER_WEIGHT - bil_tap1), bil_tap1
%assign bil_tap1 bil_tap1 + 16
%endrep
%undef bil_tap1
1509