1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14%define BLOCK_HEIGHT_WIDTH 4
15%define VP8_FILTER_WEIGHT 128
16%define VP8_FILTER_SHIFT  7
17
18
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is an
; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
; row per iteration to take advantage of the 128-bit operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal sub-pixel filter: for each of output_height rows, reads source
; bytes src_ptr[-2 .. 10] and stores 8 filtered bytes at output_ptr.
;
; vp8_filter_index selects a 16-byte entry in the k0_k5 table; the k1_k3 and
; k2_k4 coefficients are read from the same offset +128 and +256 bytes.
; When the first dword of the selected k0_k5 entry is zero, the outer tap
; pair is skipped and the 4-tap path (vp8_filter_block1d8_h4_ssse3) is used.
;
; Register roles inside both row loops:
;   rsi = source row,        rax = src_pixels_per_line
;   rdi = destination row,   rdx = output_pitch
;   rcx = rows remaining (tested after the body, so output_height must be >= 1)
;   xmm7 = rounding term loaded from rd
;
; xmm6/xmm7 are written here; they are callee-saved under the Win64 ABI, so
; the prologue/epilogue bracket the body with SAVE_XMM / RESTORE_XMM, as the
; other functions in this file that use SAVE_XMM do.
global sym(vp8_filter_block1d8_h6_ssse3)
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the table below
    shl         rdx, 4                  ;16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]      ;rounding term, shared by both paths

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax = &k0_k5[vp8_filter_index]
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-bias: the loop advances rdi before storing
;xmm3 free
filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4                    ; k0_k5 applied to the (i-2, i+3) byte pairs

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]   ; re-pair bytes for the k2_k4 taps

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]   ; re-pair bytes for the k1_k3 taps
    pmaddubsw   xmm1,   xmm5

    lea         rdi,    [rdi + rdx]             ; destination row for this iteration
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]             ; next source row
    dec         rcx                             ; ZF consumed by jnz below; nothing in between writes flags

    paddsw      xmm0,   xmm1
    paddsw      xmm2,   xmm7                    ; + rounding

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0                    ; saturate to unsigned bytes

    movq        MMWORD Ptr [rdi], xmm0
    jnz         filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: same data flow as above without the k0_k5 term.
; Entered with rax = table entry, rdi = output_ptr, xmm7 = rounding term.
vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-bias: the loop advances rdi before storing

filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2,   xmm0
    pshufb      xmm0,   xmm3                    ; shuf2bfrom1

    pshufb      xmm2,   xmm4                    ; shuf3bfrom1
    pmaddubsw   xmm0,   xmm5

    lea         rdi,    [rdi + rdx]             ; destination row for this iteration
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]             ; next source row
    dec         rcx                             ; ZF consumed by jnz below

    paddsw      xmm0,   xmm7                    ; + rounding

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal sub-pixel filter: for each of output_height rows, reads source
; bytes src_ptr[-2 .. 18] and stores 16 filtered bytes at output_ptr.
; The row is produced as two 8-pixel halves that are merged with punpcklqdq
; and written with movdqa, so every destination row must be 16-byte aligned.
;
; vp8_filter_index selects a 16-byte entry in the k0_k5 table; k1_k3 and
; k2_k4 are read from the same offset +128 and +256 bytes.
;
; Register roles inside the row loop:
;   rsi = source row,        rax = src_pixels_per_line
;   rdi = destination row,   rdx = output_pitch
;   rcx = rows remaining (tested after the body, so output_height must be >= 1)
;   xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3,  xmm7 = scratch
;
; The prologue's SAVE_XMM is paired with RESTORE_XMM in every epilogue so the
; xmm6/xmm7 values written here are restored for the caller.
global sym(vp8_filter_block1d16_h6_ssse3)
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    xor         rsi, rsi
    shl         rdx, 4                          ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax = &k0_k5[vp8_filter_index]

    mov         rdi, arg(2)                     ;output_ptr

;; The 4-tap dispatch is disabled, so the 6-tap loop is always used and
;; vp8_filter_block1d16_h4_ssse3 below is currently unreachable.
;;    cmp         esi, DWORD PTR [rax]
;;    je          vp8_filter_block1d16_h4_ssse3

    mov         rsi, arg(0)                     ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(3)           ;output_pitch

filter_block1d16_h6_rowloop_ssse3:
    ; ---- pixels 0..7 (accumulated in xmm0) ----
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4                    ; k0_k5 term

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    ; ---- pixels 8..15 start loading here, interleaved with the first half ----
    movq        xmm3,   MMWORD PTR [rsi +  6]   ;  6  7  8  9 10 11 12 13

    pmaddubsw   xmm1,   xmm5                    ; k2_k4 term
    movq        xmm7,   MMWORD PTR [rsi + 11]   ; 11 12 13 14 15 16 17 18

    pmaddubsw   xmm2,   xmm6                    ; k1_k3 term
    punpcklbw   xmm3,   xmm7                    ;  6 11  7 12  8 13  9 14 10 15 11 16 12 17 13 18

    paddsw      xmm0,   xmm1
    movdqa      xmm1,   xmm3

    pmaddubsw   xmm3,   xmm4                    ; k0_k5 term, second half
    paddsw      xmm0,   xmm2

    movdqa      xmm2,   xmm1
    paddsw      xmm0,   [GLOBAL(rd)]            ; + rounding

    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

    psraw       xmm0,   VP8_FILTER_SHIFT
    pmaddubsw   xmm1,   xmm5

    pmaddubsw   xmm2,   xmm6
    packuswb    xmm0,   xmm0                    ; pixels 0..7 as bytes

    lea         rsi,    [rsi + rax]             ; next source row
    paddsw      xmm3,   xmm1

    paddsw      xmm3,   xmm2

    paddsw      xmm3,   [GLOBAL(rd)]            ; + rounding

    psraw       xmm3,   VP8_FILTER_SHIFT

    packuswb    xmm3,   xmm3                    ; pixels 8..15 as bytes

    punpcklqdq  xmm0,   xmm3                    ; low qword = 0..7, high qword = 8..15

    movdqa      XMMWORD Ptr [rdi], xmm0

    lea         rdi,    [rdi + rdx]             ; next destination row
    dec         rcx
    jnz         filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant. Not reachable while the dispatch above stays commented out.
; Expects rax = table entry and rdi = output_ptr on entry.
vp8_filter_block1d16_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d16_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ; 16 unaligned bytes for pixels 0..7

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]
    pshufb      xmm2, [GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5

    movdqu      xmm3,   XMMWORD PTR [rsi + 6]   ; 16 unaligned bytes for pixels 8..15

    pmaddubsw   xmm2, xmm6
    movdqa      xmm0, xmm3
    pshufb      xmm3, [GLOBAL(shuf3b)]
    pshufb      xmm0, [GLOBAL(shuf2b)]

    paddsw      xmm1, [GLOBAL(rd)]              ; + rounding
    paddsw      xmm1, xmm2

    pmaddubsw   xmm0, xmm5
    pmaddubsw   xmm3, xmm6

    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1
    lea         rsi,    [rsi + rax]             ; next source row
    paddsw      xmm3, xmm0
    paddsw      xmm3, [GLOBAL(rd)]              ; + rounding
    psraw       xmm3, VP8_FILTER_SHIFT
    packuswb    xmm3, xmm3

    punpcklqdq  xmm1, xmm3                      ; low qword = 0..7, high qword = 8..15

    movdqa      XMMWORD Ptr [rdi], xmm1

    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d16_h4_rowloop_ssse3


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
332
;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal sub-pixel filter: for each of output_height rows, stores 4
; filtered bytes at output_ptr. Each iteration loads 16 unaligned bytes
; starting at src_ptr - 2, so that whole range must be readable even though
; only 4 output pixels are produced.
;
; vp8_filter_index selects a 16-byte entry in the k0_k5 table; k1_k3 and
; k2_k4 are read from the same offset +128 and +256 bytes. When the first
; dword of the k0_k5 entry is zero the 4-tap path
; (vp8_filter_block1d4_h4_ssse3) is taken.
;
; Register roles inside both row loops:
;   rsi = source row,        rax = src_pixels_per_line
;   rdi = destination row,   rdx = output_pitch
;   rcx = rows remaining (tested after the body, so output_height must be >= 1)
;   xmm7 = rounding term loaded from rd
;
; xmm6/xmm7 are written here; they are callee-saved under the Win64 ABI, so
; the body is bracketed with SAVE_XMM / RESTORE_XMM.
global sym(vp8_filter_block1d4_h6_ssse3)
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the table below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax = &k0_k5[vp8_filter_index]
    movdqa      xmm7, [GLOBAL(rd)]      ;rounding term, shared by both paths

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

;xmm3 free
filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]          ; byte pairs for the k0_k5 taps

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]          ; byte pairs for the k2_k4 taps
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [GLOBAL(shuf3b)]          ; byte pairs for the k1_k3 taps
    pmaddubsw   xmm1, xmm5

;--
    pmaddubsw   xmm2, xmm6

    lea         rsi,    [rsi + rax]             ; next source row
;--
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7                      ; + rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movd        DWORD PTR [rdi], xmm0           ; only the low 4 bytes are stored

    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. Entered with rax = table entry and xmm7 = rounding term.
vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5

;--
    pmaddubsw   xmm2, xmm6

    lea         rsi,    [rsi + rax]             ; next source row
;--
    paddsw      xmm1, xmm7                      ; + rounding
    paddsw      xmm1, xmm2
    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1           ; only the low 4 bytes are stored

    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
457
458
459
;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical sub-pixel filter over a 16-pixel-wide column block. Each output
; row combines six consecutive source rows A..F, where A is the row at
; src_ptr and F is five rows below it:
;     out = (k0_k5 . (A,F) + k1_k3 . (B,D) + k2_k4 . (C,E) + rd) >> VP8_FILTER_SHIFT
; saturated to unsigned bytes.
;
; vp8_filter_index selects a 16-byte entry in the k0_k5 table; k1_k3 and
; k2_k4 are read from the same offset +128 and +256 bytes. When the first
; dword of the k0_k5 entry is zero, rows A and F are skipped
; (vp8_filter_block1d16_v4_ssse3).
;
; Register roles inside both loops:
;   rsi = row A,  rax = rsi + pitch (row B; used to reach rows D and F)
;   rdx = src_pitch,  rdi = destination row,  rcx = rows remaining
;   r8  = out_pitch on 64-bit builds (re-read from arg(3) on 32-bit builds)
;   rcx is tested after the body, so output_height must be >= 1.
;
; The 6-tap loop stores each row as two 8-byte movq writes; the 4-tap loop
; stores with movdqa and therefore needs 16-byte aligned destination rows.
;
; xmm6/xmm7 are written here; they are callee-saved under the Win64 ABI, so
; the body is bracketed with SAVE_XMM / RESTORE_XMM.
global sym(vp8_filter_block1d16_v6_ssse3)
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the table below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax = &k0_k5[vp8_filter_index]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = row B


vp8_filter_block1d16_v6_ssse3_loop:
    ; ---- left 8 columns ----
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

    pmaddubsw   xmm3, xmm6                  ;k2_k4
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7                  ;k1_k3
    pmaddubsw   xmm1, xmm5                  ;k0_k5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ;store the results

    ; ---- right 8 columns ----
    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
    pmaddubsw   xmm3, xmm6                  ;k2_k4
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7                  ;k1_k3
    pmaddubsw   xmm1, xmm5                  ;k0_k5

    add         rsi,  rdx                   ;slide the 6-row window down one row
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: rows B..E only. Entered with rax = table entry.
vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = row B

vp8_filter_block1d16_v4_ssse3_loop:
    ; ---- left 8 columns (result in xmm2) ----
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6                  ;k2_k4
    pmaddubsw   xmm2, xmm7                  ;k1_k3
    ; ---- right 8 columns (result in xmm5) ----
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
    paddsw      xmm2, xmm3
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6                  ;k2_k4
    pmaddubsw   xmm5, xmm7                  ;k1_k3

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi,  rdx                   ;slide the row window down one row
    add         rax,  rdx
;--
;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4                  ;+ rounding
    psraw       xmm5, VP8_FILTER_SHIFT
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5                  ;low qword = left half, high qword = right half

    movdqa       XMMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
644
;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical sub-pixel filter over an 8-pixel-wide column block. Each output
; row combines six consecutive source rows A..F, where A is the row at
; src_ptr and F is five rows below it:
;     out = (k0_k5 . (A,F) + k1_k3 . (B,D) + k2_k4 . (C,E) + rd) >> VP8_FILTER_SHIFT
; saturated to unsigned bytes and stored as 8 bytes with movq.
;
; vp8_filter_index selects a 16-byte entry in the k0_k5 table; k1_k3 and
; k2_k4 are read from the same offset +128 and +256 bytes. When the first
; dword of the k0_k5 entry is zero, rows A and F are skipped
; (vp8_filter_block1d8_v4_ssse3).
;
; Register roles inside both loops:
;   rsi = row A,  rax = rsi + pitch (row B; used to reach rows D and F)
;   rdx = src_pitch,  rdi = destination row,  rcx = rows remaining
;   r8  = out_pitch on 64-bit builds (re-read from arg(3) on 32-bit builds)
;   rcx is tested after the body, so output_height must be >= 1.
;
; xmm6/xmm7 are written here; they are callee-saved under the Win64 ABI, so
; the body is bracketed with SAVE_XMM / RESTORE_XMM.
global sym(vp8_filter_block1d8_v6_ssse3)
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the table below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax = &k0_k5[vp8_filter_index]

    ; These are loaded before the dispatch because both paths use them.
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = row B

vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    movdqa      xmm4, [GLOBAL(rd)]          ;rounding term (xmm4 is free again after B D)

    pmaddubsw   xmm3, xmm6                  ;k2_k4
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7                  ;k1_k3
    pmaddubsw   xmm1, xmm5                  ;k0_k5
    add         rsi,  rdx                   ;slide the 6-row window down one row
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4                  ;+ rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: rows B..E only. Entered with rax = table entry and
; rdx/rdi/rcx (and r8 on 64-bit builds) already loaded.
vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]              ;rounding term

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = row B

vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6                  ;k2_k4
    pmaddubsw   xmm2, xmm7                  ;k1_k3
    add         rsi,  rdx                   ;slide the row window down one row
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5                  ;+ rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical sub-pixel filter for a 4-pixel-wide column, using the 64-bit MMX
; registers. Source rows are numbered 0..5 starting at src_ptr; each output
; row is
;     (k0_k5 . (row0,row5) + k1_k3 . (row1,row3) + k2_k4 . (row2,row4) + rd)
;         >> VP8_FILTER_SHIFT
; saturated to unsigned bytes, with the low 4 bytes stored.
;
; If the first dword of the selected k0_k5 entry is zero, rows 0 and 5 are
; left out (vp8_filter_block1d4_v4_ssse3).
;
; The row counter is tested after the loop body, so output_height must be
; at least 1.
; NOTE(review): mm0-mm7 are written and no emms is issued in this routine;
; presumably the caller clears the MMX state - confirm.
global sym(vp8_filter_block1d4_v6_ssse3)
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rax = address of this filter's 16-byte k0_k5 entry
    movsxd      rax, DWORD PTR arg(5)   ; vp8_filter_index
    shl         rax, 4
    lea         rdx, [GLOBAL(k0_k5)]
    add         rax, rdx

    ; arguments common to the 6-tap and 4-tap paths
    mov         rsi, arg(0)             ; src_ptr (row 0)
    movsxd      rdx, DWORD PTR arg(1)   ; src_pitch
    mov         rdi, arg(2)             ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ; output_height

    cmp         DWORD PTR [rax], 0      ; outer taps all zero?
    je          vp8_filter_block1d4_v4_ssse3

    movq        mm5, MMWORD PTR [rax]         ; k0_k5
    movq        mm7, MMWORD PTR [rax+128]     ; k1_k3
    movq        mm6, MMWORD PTR [rax+256]     ; k2_k4

    lea         rax, [rsi + rdx]        ; rax = row 1, used to reach rows 3 and 5

vp8_filter_block1d4_v6_ssse3_loop:
    ; rows 1 and 3, weighted by k1_k3
    movd        mm2, DWORD PTR [rsi + rdx]
    movd        mm4, DWORD PTR [rax + rdx * 2]
    punpcklbw   mm2, mm4
    pmaddubsw   mm2, mm7

    ; rows 2 and 4, weighted by k2_k4
    movd        mm3, DWORD PTR [rsi + rdx * 2]
    movd        mm0, DWORD PTR [rsi + rdx * 4]
    punpcklbw   mm3, mm0
    pmaddubsw   mm3, mm6

    ; rows 0 and 5, weighted by k0_k5
    movd        mm1, DWORD PTR [rsi]
    movd        mm0, DWORD PTR [rax + rdx * 4]
    punpcklbw   mm1, mm0
    pmaddubsw   mm1, mm5

    ; saturating accumulation, kept in this order, then round/shift/pack
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, [GLOBAL(rd)]
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

    ; step source window and destination down one row
    add         rsi,  rdx
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ; out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d4_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. On entry rax = table entry; rsi, rdx, rdi, rcx (and r8 on
; 64-bit builds) already hold the arguments.
vp8_filter_block1d4_v4_ssse3:
    movq        mm5, MMWORD PTR [GLOBAL(rd)]  ; rounding term
    movq        mm7, MMWORD PTR [rax+128]     ; k1_k3
    movq        mm6, MMWORD PTR [rax+256]     ; k2_k4

    lea         rax, [rsi + rdx]        ; rax = row 1, used to reach row 3

vp8_filter_block1d4_v4_ssse3_loop:
    ; rows 1 and 3, weighted by k1_k3
    movd        mm2, DWORD PTR [rsi + rdx]
    movd        mm4, DWORD PTR [rax + rdx * 2]
    punpcklbw   mm2, mm4
    pmaddubsw   mm2, mm7

    ; rows 2 and 4, weighted by k2_k4
    movd        mm3, DWORD PTR [rsi + rdx * 2]
    movd        mm0, DWORD PTR [rsi + rdx * 4]
    punpcklbw   mm3, mm0
    pmaddubsw   mm3, mm6

    ; saturating accumulation, kept in this order, then round/shift/pack
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

    ; step source window and destination down one row
    add         rsi,  rdx
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ; out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d4_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
920
921;void vp8_bilinear_predict16x16_ssse3
922;(
923;    unsigned char  *src_ptr,
924;    int   src_pixels_per_line,
925;    int  xoffset,
926;    int  yoffset,
927;    unsigned char *dst_ptr,
928;    int dst_pitch
929;)
930global sym(vp8_bilinear_predict16x16_ssse3)
931sym(vp8_bilinear_predict16x16_ssse3):
932    push        rbp
933    mov         rbp, rsp
934    SHADOW_ARGS_TO_STACK 6
935    SAVE_XMM
936    GET_GOT     rbx
937    push        rsi
938    push        rdi
939    ; end prolog
940
941        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
942        movsxd      rax,        dword ptr arg(2)    ; xoffset
943
944        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
945        je          b16x16_sp_only
946
947        shl         rax,        4
948        lea         rax,        [rax + rcx]         ; HFilter
949
950        mov         rdi,        arg(4)              ; dst_ptr
951        mov         rsi,        arg(0)              ; src_ptr
952        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
953
954        movdqa      xmm1,       [rax]
955
956        movsxd      rax,        dword ptr arg(3)    ; yoffset
957
958        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
959        je          b16x16_fp_only
960
961        shl         rax,        4
962        lea         rax,        [rax + rcx]         ; VFilter
963
964        lea         rcx,        [rdi+rdx*8]
965        lea         rcx,        [rcx+rdx*8]
966        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
967
968        movdqa      xmm2,       [rax]
969
970%if ABI_IS_32BIT=0
971        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
972%endif
973        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
974        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
975
976        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
977        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
978
979        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
980
981        lea         rsi,        [rsi + rdx]         ; next line
982
983        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
984
985        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
986        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
987
988        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
989        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
990
991        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
992        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
993
994        movdqa      xmm7,       xmm3
995        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
996
997.next_row:
998        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
999        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
1000
1001        punpcklbw   xmm6,       xmm5
1002        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
1003
1004        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
1005        lea         rsi,        [rsi + rdx]         ; next line
1006
1007        pmaddubsw   xmm6,       xmm1
1008
1009        punpcklbw   xmm4,       xmm5
1010        pmaddubsw   xmm4,       xmm1
1011
1012        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
1013        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
1014
1015        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
1016        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
1017
1018        packuswb    xmm6,       xmm4
1019        movdqa      xmm5,       xmm7
1020
1021        punpcklbw   xmm5,       xmm6
1022        pmaddubsw   xmm5,       xmm2
1023
1024        punpckhbw   xmm7,       xmm6
1025        pmaddubsw   xmm7,       xmm2
1026
1027        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
1028        psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
1029
1030        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
1031        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
1032
1033        packuswb    xmm5,       xmm7
1034        movdqa      xmm7,       xmm6
1035
1036        movdqa      [rdi],      xmm5                ; store the results in the destination
1037%if ABI_IS_32BIT
1038        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
1039%else
1040        add         rdi,        r8
1041%endif
1042
1043        cmp         rdi,        rcx
1044        jne         .next_row
1045
1046        jmp         done
1047
1048b16x16_sp_only:
1049        movsxd      rax,        dword ptr arg(3)    ; yoffset
1050        shl         rax,        4
1051        lea         rax,        [rax + rcx]         ; VFilter
1052
1053        mov         rdi,        arg(4)              ; dst_ptr
1054        mov         rsi,        arg(0)              ; src_ptr
1055        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
1056
1057        movdqa      xmm1,       [rax]               ; VFilter
1058
1059        lea         rcx,        [rdi+rdx*8]
1060        lea         rcx,        [rcx+rdx*8]
1061        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1062
1063        ; get the first horizontal line done
1064        movq        xmm4,       [rsi]               ; load row 0
1065        movq        xmm2,       [rsi + 8]           ; load row 0
1066
1067        lea         rsi,        [rsi + rax]         ; next line
1068.next_row:
1069        movq        xmm3,       [rsi]               ; load row + 1
1070        movq        xmm5,       [rsi + 8]           ; load row + 1
1071
1072        punpcklbw   xmm4,       xmm3
1073        punpcklbw   xmm2,       xmm5
1074
1075        pmaddubsw   xmm4,       xmm1
1076        movq        xmm7,       [rsi + rax]         ; load row + 2
1077
1078        pmaddubsw   xmm2,       xmm1
1079        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
1080
1081        punpcklbw   xmm3,       xmm7
1082        punpcklbw   xmm5,       xmm6
1083
1084        pmaddubsw   xmm3,       xmm1
1085        paddw       xmm4,       [GLOBAL(rd)]
1086
1087        pmaddubsw   xmm5,       xmm1
1088        paddw       xmm2,       [GLOBAL(rd)]
1089
1090        psraw       xmm4,       VP8_FILTER_SHIFT
1091        psraw       xmm2,       VP8_FILTER_SHIFT
1092
1093        packuswb    xmm4,       xmm2
1094        paddw       xmm3,       [GLOBAL(rd)]
1095
1096        movdqa      [rdi],      xmm4                ; store row 0
1097        paddw       xmm5,       [GLOBAL(rd)]
1098
1099        psraw       xmm3,       VP8_FILTER_SHIFT
1100        psraw       xmm5,       VP8_FILTER_SHIFT
1101
1102        packuswb    xmm3,       xmm5
1103        movdqa      xmm4,       xmm7
1104
1105        movdqa      [rdi + rdx],xmm3                ; store row 1
1106        lea         rsi,        [rsi + 2*rax]
1107
1108        movdqa      xmm2,       xmm6
1109        lea         rdi,        [rdi + 2*rdx]
1110
1111        cmp         rdi,        rcx
1112        jne         .next_row
1113
1114        jmp         done
1115
1116b16x16_fp_only:
1117        lea         rcx,        [rdi+rdx*8]
1118        lea         rcx,        [rcx+rdx*8]
1119        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1120
1121.next_row:
1122        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
1123        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
1124
1125        punpcklbw   xmm2,       xmm4
1126        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
1127
1128        pmaddubsw   xmm2,       xmm1
1129        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
1130
1131        lea         rsi,        [rsi + rax]         ; next line
1132        punpcklbw   xmm3,       xmm4
1133
1134        pmaddubsw   xmm3,       xmm1
1135        movq        xmm5,       [rsi]
1136
1137        paddw       xmm2,       [GLOBAL(rd)]
1138        movq        xmm7,       [rsi+1]
1139
1140        movq        xmm6,       [rsi+8]
1141        psraw       xmm2,       VP8_FILTER_SHIFT
1142
1143        punpcklbw   xmm5,       xmm7
1144        movq        xmm7,       [rsi+9]
1145
1146        paddw       xmm3,       [GLOBAL(rd)]
1147        pmaddubsw   xmm5,       xmm1
1148
1149        psraw       xmm3,       VP8_FILTER_SHIFT
1150        punpcklbw   xmm6,       xmm7
1151
1152        packuswb    xmm2,       xmm3
1153        pmaddubsw   xmm6,       xmm1
1154
1155        movdqa      [rdi],      xmm2                ; store the results in the destination
1156        paddw       xmm5,       [GLOBAL(rd)]
1157
1158        lea         rdi,        [rdi + rdx]         ; dst_pitch
1159        psraw       xmm5,       VP8_FILTER_SHIFT
1160
1161        paddw       xmm6,       [GLOBAL(rd)]
1162        psraw       xmm6,       VP8_FILTER_SHIFT
1163
1164        packuswb    xmm5,       xmm6
1165        lea         rsi,        [rsi + rax]         ; next line
1166
1167        movdqa      [rdi],      xmm5                ; store the results in the destination
1168        lea         rdi,        [rdi + rdx]         ; dst_pitch
1169
1170        cmp         rdi,        rcx
1171
1172        jne         .next_row
1173
1174done:
1175    ; begin epilog
1176    pop         rdi
1177    pop         rsi
1178    RESTORE_GOT
1179    RESTORE_XMM
1180    UNSHADOW_ARGS
1181    pop         rbp
1182    ret
1183
;/************************************************************************************
; vp8_bilinear_predict8x8_ssse3
;
; Writes an 8x8 block of bilinearly interpolated pixels to dst_ptr.
;
; xoffset / yoffset each select one 16-byte row of vp8_bilinear_filters_ssse3
; (the table has eight rows, so the usable range is 0..7). A row holds the byte
; pair (w0, w1) eight times, and every filtered pixel is
;
;       (a * w0 + b * w1 + 64) >> VP8_FILTER_SHIFT
;
; packed back to a byte with unsigned saturation. a/b are horizontal neighbours
; in the first pass and vertical neighbours in the second pass. When both passes
; run, the first-pass result is rounded to bytes before the second pass.
;
; Paths:
;   xoffset != 0, yoffset != 0 : horizontal pass followed by vertical pass
;   xoffset == 0, yoffset != 0 : vertical pass only      (b8x8_sp_only)
;   xoffset != 0, yoffset == 0 : horizontal pass only    (b8x8_fp_only)
;   xoffset == 0, yoffset == 0 : plain 8x8 copy          (.copy_block)
;
; The copy path is required for correctness: table row 0 stores the weight 128
; as the byte 0x80, and pmaddubsw treats its source operand (the table row) as
; signed bytes, i.e. as -128. Running a filter pass with row 0 would therefore
; saturate every output pixel to 0 instead of passing the source through.
;
; Memory:
;   * Nine source rows are fetched with 16-byte unaligned loads although only
;     bytes 0..8 of each row are consumed, so 16 bytes must be readable at
;     src_ptr + n * src_pixels_per_line for n = 0..8.
;   * Destination rows are written with 8-byte movq stores; no alignment is
;     required for dst_ptr or dst_pitch.
;
; Stack:
;   144 bytes (9 rows x 16) of 16-byte aligned scratch hold the source rows.
;   The two-pass and fp-only paths advance rsp itself as the row cursor (rows
;   already consumed fall below rsp, rows still needed stay at or above it).
;   Every path must leave rsp exactly 144 bytes above the scratch base before
;   reaching done8x8, because `pop rsp` reloads the stack pointer from that
;   slot (presumably the value pushed by ALIGN_STACK; the macro body lives in
;   x86_abi_support.asm and is not visible here).
;*************************************************************************************/
;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict8x8_ssse3)
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; reserve 144 bytes (9 rows * 16)

        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
        movdqu      xmm0,       [rsi]               ; row 0
        lea         rax,        [rdx + rdx*2]       ; rax = 3 * src_pixels_per_line
        movdqu      xmm1,       [rsi+rdx]           ; row 1
        movdqu      xmm2,       [rsi+rdx*2]         ; row 2
        add         rsi,        rax
        movdqu      xmm3,       [rsi]               ; row 3
        movdqu      xmm4,       [rsi+rdx]           ; row 4
        movdqu      xmm5,       [rsi+rdx*2]         ; row 5
        add         rsi,        rax
        movdqu      xmm6,       [rsi]               ; row 6
        movdqu      xmm7,       [rsi+rdx]           ; row 7

        movdqa      XMMWORD PTR [rsp],            xmm0

        movdqu      xmm0,       [rsi+rdx*2]         ; row 8 (xmm0 is free again)

        movdqa      XMMWORD PTR [rsp+16],         xmm1
        movdqa      XMMWORD PTR [rsp+32],         xmm2
        movdqa      XMMWORD PTR [rsp+48],         xmm3
        movdqa      XMMWORD PTR [rsp+64],         xmm4
        movdqa      XMMWORD PTR [rsp+80],         xmm5
        movdqa      XMMWORD PTR [rsp+96],         xmm6
        movdqa      XMMWORD PTR [rsp+112],        xmm7
        movdqa      XMMWORD PTR [rsp+128],        xmm0

        movsxd      rax,        dword ptr arg(2)    ; xoffset
        test        rax,        rax                 ; skip first_pass filter if xoffset=0
        je          b8x8_sp_only

        shl         rax,        4                   ; 16 bytes per table row
        add         rax,        rcx                 ; HFilter

        mov         rdi,        arg(4)              ; dst_ptr
        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

        movdqa      xmm0,       [rax]               ; xmm0 = HFilter weights

        movsxd      rax,        dword ptr arg(3)    ; yoffset
        test        rax,        rax                 ; skip second_pass filter if yoffset=0
        je          b8x8_fp_only

        shl         rax,        4
        lea         rax,        [rax + rcx]         ; VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch (loop end)

        movdqa      xmm1,       [rax]               ; xmm1 = VFilter weights

        ; get the first horizontal line done
        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        movdqa      xmm5,       xmm3

        psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
        lea         rsp,        [rsp + 16]          ; next line

        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
        pmaddubsw   xmm3,       xmm0                ; 8 words: p[i]*w0 + p[i+1]*w1

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

        movdqa      xmm7,       xmm3
        packuswb    xmm7,       xmm7                ; xmm7 = 8 filtered bytes of the previous row

.next_row:
        ; horizontal pass on the current row
        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        lea         rsp,        [rsp + 16]          ; next line

        movdqa      xmm5,       xmm6

        psrldq      xmm5,       1

        punpcklbw   xmm6,       xmm5
        pmaddubsw   xmm6,       xmm0

        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

        packuswb    xmm6,       xmm6                ; xmm6 = 8 filtered bytes of the current row

        ; vertical pass between previous (xmm7) and current (xmm6) row
        punpcklbw   xmm7,       xmm6
        pmaddubsw   xmm7,       xmm1

        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

        packuswb    xmm7,       xmm7

        movq        [rdi],      xmm7                ; store the results in the destination
        lea         rdi,        [rdi + rdx]

        movdqa      xmm7,       xmm6                ; current row becomes the previous row

        cmp         rdi,        rcx
        jne         .next_row                       ; 8 iterations: rsp ends 144 bytes above base

        jmp         done8x8

b8x8_sp_only:
        mov         rdi,        arg(4) ;dst_ptr
        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

        movsxd      rax,        dword ptr arg(3)    ; yoffset
        test        rax,        rax                 ; xoffset == 0 and yoffset == 0:
        jz          .copy_block                     ; table row 0 is unusable, copy instead

        shl         rax,        4
        lea         rax,        [rax + rcx]         ; VFilter

        movdqa      xmm0,       [rax]               ; VFilter

        ; Rows 0..6 are loaded (low 8 bytes each) and interleaved with their
        ; lower neighbour; rows 7 and 8 follow once xmm1/xmm2 are free again.
        movq        xmm1,       XMMWORD PTR [rsp]
        movq        xmm2,       XMMWORD PTR [rsp+16]

        movq        xmm3,       XMMWORD PTR [rsp+32]
        punpcklbw   xmm1,       xmm2                ; rows 0/1

        movq        xmm4,       XMMWORD PTR [rsp+48]
        punpcklbw   xmm2,       xmm3                ; rows 1/2

        movq        xmm5,       XMMWORD PTR [rsp+64]
        punpcklbw   xmm3,       xmm4                ; rows 2/3

        movq        xmm6,       XMMWORD PTR [rsp+80]
        punpcklbw   xmm4,       xmm5                ; rows 3/4

        movq        xmm7,       XMMWORD PTR [rsp+96]
        punpcklbw   xmm5,       xmm6                ; rows 4/5

        pmaddubsw   xmm1,       xmm0
        pmaddubsw   xmm2,       xmm0

        pmaddubsw   xmm3,       xmm0
        pmaddubsw   xmm4,       xmm0

        pmaddubsw   xmm5,       xmm0
        punpcklbw   xmm6,       xmm7                ; rows 5/6

        pmaddubsw   xmm6,       xmm0
        paddw       xmm1,       [GLOBAL(rd)]

        paddw       xmm2,       [GLOBAL(rd)]
        psraw       xmm1,       VP8_FILTER_SHIFT

        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm2,       VP8_FILTER_SHIFT

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT

        paddw       xmm5,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        paddw       xmm6,       [GLOBAL(rd)]
        psraw       xmm5,       VP8_FILTER_SHIFT

        psraw       xmm6,       VP8_FILTER_SHIFT
        packuswb    xmm1,       xmm1

        packuswb    xmm2,       xmm2
        movq        [rdi],      xmm1                ; output row 0

        packuswb    xmm3,       xmm3
        movq        [rdi+rdx],  xmm2                ; output row 1

        packuswb    xmm4,       xmm4
        movq        xmm1,       XMMWORD PTR [rsp+112]   ; source row 7

        lea         rdi,        [rdi + 2*rdx]
        movq        xmm2,       XMMWORD PTR [rsp+128]   ; source row 8

        packuswb    xmm5,       xmm5
        movq        [rdi],      xmm3                ; output row 2

        packuswb    xmm6,       xmm6
        movq        [rdi+rdx],  xmm4                ; output row 3

        lea         rdi,        [rdi + 2*rdx]
        punpcklbw   xmm7,       xmm1                ; rows 6/7

        movq        [rdi],      xmm5                ; output row 4
        pmaddubsw   xmm7,       xmm0

        movq        [rdi+rdx],  xmm6                ; output row 5
        punpcklbw   xmm1,       xmm2                ; rows 7/8

        pmaddubsw   xmm1,       xmm0
        paddw       xmm7,       [GLOBAL(rd)]

        psraw       xmm7,       VP8_FILTER_SHIFT
        paddw       xmm1,       [GLOBAL(rd)]

        psraw       xmm1,       VP8_FILTER_SHIFT
        packuswb    xmm7,       xmm7

        packuswb    xmm1,       xmm1
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm7                ; output row 6

        movq        [rdi+rdx],  xmm1                ; output row 7
        lea         rsp,        [rsp + 144]         ; release the scratch rows

        jmp         done8x8

.copy_block:
        ; xoffset == 0 and yoffset == 0: no interpolation, copy the low 8 bytes
        ; of source rows 0..7 from the scratch buffer to the destination.
        lea         rax,        [rdx + rdx*2]       ; rax = 3 * dst_pitch

        movq        xmm0,       [rsp]
        movq        xmm1,       [rsp+16]
        movq        xmm2,       [rsp+32]
        movq        xmm3,       [rsp+48]

        movq        [rdi],      xmm0                ; output rows 0..3
        movq        [rdi+rdx],  xmm1
        movq        [rdi+rdx*2],xmm2
        movq        [rdi+rax],  xmm3

        lea         rdi,        [rdi + 4*rdx]

        movq        xmm0,       [rsp+64]
        movq        xmm1,       [rsp+80]
        movq        xmm2,       [rsp+96]
        movq        xmm3,       [rsp+112]

        movq        [rdi],      xmm0                ; output rows 4..7
        movq        [rdi+rdx],  xmm1
        movq        [rdi+rdx*2],xmm2
        movq        [rdi+rax],  xmm3

        lea         rsp,        [rsp + 144]         ; release the scratch rows

        jmp         done8x8

b8x8_fp_only:
        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch (loop end)

.next_row:
        ; four source rows per iteration, two iterations in total
        movdqa      xmm1,       XMMWORD PTR [rsp]
        movdqa      xmm3,       XMMWORD PTR [rsp+16]

        movdqa      xmm2,       xmm1
        movdqa      xmm5,       XMMWORD PTR [rsp+32]

        psrldq      xmm2,       1                   ; row n shifted left by one pixel
        movdqa      xmm7,       XMMWORD PTR [rsp+48]

        movdqa      xmm4,       xmm3
        psrldq      xmm4,       1

        movdqa      xmm6,       xmm5
        psrldq      xmm6,       1

        punpcklbw   xmm1,       xmm2                ; pair each pixel with its right neighbour
        pmaddubsw   xmm1,       xmm0

        punpcklbw   xmm3,       xmm4
        pmaddubsw   xmm3,       xmm0

        punpcklbw   xmm5,       xmm6
        pmaddubsw   xmm5,       xmm0

        movdqa      xmm2,       xmm7
        psrldq      xmm2,       1

        punpcklbw   xmm7,       xmm2
        pmaddubsw   xmm7,       xmm0

        paddw       xmm1,       [GLOBAL(rd)]
        psraw       xmm1,       VP8_FILTER_SHIFT

        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT

        paddw       xmm5,       [GLOBAL(rd)]
        psraw       xmm5,       VP8_FILTER_SHIFT

        paddw       xmm7,       [GLOBAL(rd)]
        psraw       xmm7,       VP8_FILTER_SHIFT

        packuswb    xmm1,       xmm1
        packuswb    xmm3,       xmm3

        packuswb    xmm5,       xmm5
        movq        [rdi],      xmm1

        packuswb    xmm7,       xmm7
        movq        [rdi+rdx],  xmm3

        lea         rdi,        [rdi + 2*rdx]
        movq        [rdi],      xmm5

        lea         rsp,        [rsp + 4*16]        ; four scratch rows consumed
        movq        [rdi+rdx],  xmm7

        lea         rdi,        [rdi + 2*rdx]
        cmp         rdi,        rcx

        jne         .next_row

        lea         rsp,        [rsp + 16]          ; skip the unused 9th row (total: 144 bytes)

done8x8:
    ;add rsp, 144   (each path above has already advanced rsp by 144)
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1495
SECTION_RODATA

; One 16-byte table row: the byte pair (%1, %2) repeated eight times, so that
; a single 128-bit multiply-add applies the same two weights to eight pixel
; pairs.
%macro FILTER_PAIR_X8 2
    times 8 db %1, %2
%endmacro

;-----------------------------------------------------------------------------
; Byte-index tables (16 bytes each; presumably pshufb control masks for the
; 6-tap routines earlier in the file -- their users are outside this chunk).
;   shuf1b : pairs byte i with byte i+5
;   shuf2b : pairs byte i+2 with byte i+4
;   shuf3b : pairs byte i+1 with byte i+3
;-----------------------------------------------------------------------------
align 16
shuf1b:
    db  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7, 12
shuf2b:
    db  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11
shuf3b:
    db  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10

; Same pairings as shuf2b / shuf3b, but expressed as indices into a register
; that has already been permuted with shuf1b.
align 16
shuf2bfrom1:
    db  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11,  9, 13
align 16
shuf3bfrom1:
    db  2,  6,  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11

;-----------------------------------------------------------------------------
; Rounding term added to every 16-bit sum before the arithmetic shift right by
; VP8_FILTER_SHIFT: half of the filter weight, i.e. 0x40 in each word.
;-----------------------------------------------------------------------------
align 16
rd:
    times 8 dw (1 << (VP8_FILTER_SHIFT - 1))

;-----------------------------------------------------------------------------
; 6-tap filter coefficients, split into tap pairs. Each table has eight rows of
; 16 bytes, one row per filter index (row = index * 16). The three tables are
; emitted back to back with no alignment directive in between.
; NOTE(review): code outside this chunk may address k1_k3 / k2_k4 as fixed
; offsets from k0_k5 -- confirm before reordering or padding these tables.
;-----------------------------------------------------------------------------
align 16
k0_k5:                              ; taps 0 and 5
    FILTER_PAIR_X8    0,    0       ; placeholder
    FILTER_PAIR_X8    0,    0
    FILTER_PAIR_X8    2,    1
    FILTER_PAIR_X8    0,    0
    FILTER_PAIR_X8    3,    3
    FILTER_PAIR_X8    0,    0
    FILTER_PAIR_X8    1,    2
    FILTER_PAIR_X8    0,    0
k1_k3:                              ; taps 1 and 3
    FILTER_PAIR_X8    0,    0       ; placeholder
    FILTER_PAIR_X8   -6,   12
    FILTER_PAIR_X8  -11,   36
    FILTER_PAIR_X8   -9,   50
    FILTER_PAIR_X8  -16,   77
    FILTER_PAIR_X8   -6,   93
    FILTER_PAIR_X8   -8,  108
    FILTER_PAIR_X8   -1,  123
k2_k4:                              ; taps 2 and 4
    FILTER_PAIR_X8  128,    0       ; placeholder
    FILTER_PAIR_X8  123,   -1
    FILTER_PAIR_X8  108,   -8
    FILTER_PAIR_X8   93,   -6
    FILTER_PAIR_X8   77,  -16
    FILTER_PAIR_X8   50,   -9
    FILTER_PAIR_X8   36,  -11
    FILTER_PAIR_X8   12,   -6

;-----------------------------------------------------------------------------
; Bilinear weights, one 16-byte row per sub-pixel offset 0..7 (row = offset *
; 16). Each row is the pair (128 - 16 * offset, 16 * offset).
; The 128 in row 0 is stored as byte 0x80, which pmaddubsw reads as -128 when
; the table row is its signed (source) operand, so row 0 does not behave as a
; pass-through weight.
;-----------------------------------------------------------------------------
align 16
vp8_bilinear_filters_ssse3:
    FILTER_PAIR_X8  128,    0
    FILTER_PAIR_X8  112,   16
    FILTER_PAIR_X8   96,   32
    FILTER_PAIR_X8   80,   48
    FILTER_PAIR_X8   64,   64
    FILTER_PAIR_X8   48,   80
    FILTER_PAIR_X8   32,   96
    FILTER_PAIR_X8   16,  112
1554
1555