1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; VERTx4 <avg>
;   8-tap vertical filter over rows that are 4 pixels wide.
;     %1 = 0  store the filtered pixels to the destination
;     %1 = 1  pavgb the filtered pixels with the bytes already in the
;             destination, then store
;
;   Reads arg(0..5) = src_ptr, src pitch, output_ptr, out_pitch,
;   output_height, filter.  The invoking function must have %define'd
;   k0k1, k2k3, k4k5, k6k7 and krd as 16-byte aligned memory slots
;   (they are written with movdqa).  The filter is read with movdqa as
;   well, so it must be 16-byte aligned.
;
;   Uses rax, rbx, rcx, rdx, rsi, rdi and xmm0-xmm7, plus r8 when
;   ABI_IS_32BIT is 0.
;-----------------------------------------------------------------------
%macro VERTx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two 16-bit words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx                   ;rounding constant, same form as VERTx8/VERTx16
    packsswb    xmm4, xmm4                  ;taps: 16-bit -> signed 8-bit
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each tap pair to all 8 words
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;broadcast the rounding words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch (odd rows)

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movd        xmm0, [rsi]                 ;A
    movd        xmm1, [rsi + rdx]           ;B
    movd        xmm2, [rsi + rdx * 2]       ;C
    movd        xmm3, [rax + rdx * 2]       ;D
    movd        xmm4, [rsi + rdx * 4]       ;E
    movd        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movd        xmm6, [rsi + rbx]           ;G
    movd        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ;Saturating sum: outer pairs first, then the smaller of the two
    ;centre terms, then the larger one.
    movdqa      xmm1, xmm2
    paddsw      xmm0, xmm6
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+64
    psraw       xmm0, 7                     ;>>7
    packuswb    xmm0, xmm0                  ;clamp to unsigned bytes

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
100
;-----------------------------------------------------------------------
; VERTx8 <avg>
;   8-tap vertical filter over rows that are 8 pixels wide.
;     %1 = 0  store the result, %1 = 1  pavgb with the destination first
;
;   Reads arg(0..5) = src_ptr, src pitch, output_ptr, out_pitch,
;   output_height, filter.  k0k1, k2k3, k4k5, k6k7 and krd must be
;   %define'd by the invoking function as 16-byte aligned slots.
;   Uses rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT is 0).
;-----------------------------------------------------------------------
%macro VERTx8 1
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr

    ;Pack the eight 16-bit taps to bytes and spill one broadcast
    ;register per tap pair.
    movdqa      xmm4, [rdx]
    packsswb    xmm4, xmm4

    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0

    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1

    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2

    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3

    ;Rounding: 64 in every 16-bit lane.
    mov         rcx, 0x0400040
    movq        xmm5, rcx
    pshufd      xmm5, xmm5, 0
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    lea         rax, [rsi + rdx]            ;src + pitch
    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    ;rows A,B with taps 0,1
    movq        xmm0, [rsi]
    movq        xmm1, [rsi + rdx]
    punpcklbw   xmm0, xmm1
    pmaddubsw   xmm0, k0k1

    ;rows C,D with taps 2,3
    movq        xmm2, [rsi + rdx * 2]
    movq        xmm3, [rax + rdx * 2]
    punpcklbw   xmm2, xmm3
    pmaddubsw   xmm2, k2k3

    ;rows E,F with taps 4,5
    movq        xmm4, [rsi + rdx * 4]
    movq        xmm5, [rax + rdx * 4]
    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, k4k5

    ;rows G,H with taps 6,7
    movq        xmm6, [rsi + rbx]
    movq        xmm7, [rax + rbx]
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, k6k7

    ;xmm4 = min(CD, EF), xmm2 = max(CD, EF)
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1

    ;saturating accumulation: AB + GH, + min, + max, + rounding
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ;advance one source row and one destination row
    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
187
188
;-----------------------------------------------------------------------
; VERTx16 <avg>
;   8-tap vertical filter over rows that are 16 pixels wide, computed
;   as two 8-pixel halves per row.
;     %1 = 0  store the result, %1 = 1  pavgb with the destination first
;
;   Reads arg(0..5) = src_ptr, src pitch, output_ptr, out_pitch,
;   output_height, filter.  k0k1, k2k3, k4k5, k6k7 and krd must be
;   %define'd by the invoking function as 16-byte aligned slots.
;   Uses rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT is 0).
;-----------------------------------------------------------------------
%macro VERTx16 1
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr

    ;Pack the eight 16-bit taps to bytes and spill one broadcast
    ;register per tap pair.
    movdqa      xmm4, [rdx]
    packsswb    xmm4, xmm4

    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0

    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1

    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2

    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3

    ;Rounding: 64 in every 16-bit lane.
    mov         rcx, 0x0400040
    movq        xmm5, rcx
    pshufd      xmm5, xmm5, 0
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    lea         rax, [rsi + rdx]            ;src + pitch
    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    ;---------------- left 8 pixels ----------------
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    punpcklbw   xmm0, xmm1
    pmaddubsw   xmm0, k0k1

    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    punpcklbw   xmm2, xmm3
    pmaddubsw   xmm2, k2k3

    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F
    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, k4k5

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, k6k7

    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)

    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ;---------------- right 8 pixels ---------------
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    punpcklbw   xmm0, xmm1
    pmaddubsw   xmm0, k0k1

    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    punpcklbw   xmm2, xmm3
    pmaddubsw   xmm2, k2k3

    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F
    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, k4k5

    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, k6k7

    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)

    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi+8], xmm0

    ;advance one source row and one destination row
    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
312
;-----------------------------------------------------------------------
; void vp9_filter_block1d4_v8_ssse3(unsigned char *src_ptr,
;                                   unsigned int   src_pitch,
;                                   unsigned char *output_ptr,
;                                   unsigned int   out_pitch,
;                                   unsigned int   output_height,
;                                   short         *filter)
;
; Vertical 8-tap filter, 4 pixels wide; the result is stored to
; output_ptr (VERTx4 with averaging disabled).
;-----------------------------------------------------------------------
; Scratch slots used by VERTx4, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4 0

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
353
;-----------------------------------------------------------------------
; void vp9_filter_block1d8_v8_ssse3(unsigned char *src_ptr,
;                                   unsigned int   src_pitch,
;                                   unsigned char *output_ptr,
;                                   unsigned int   out_pitch,
;                                   unsigned int   output_height,
;                                   short         *filter)
;
; Vertical 8-tap filter, 8 pixels wide; the result is stored to
; output_ptr (VERTx8 with averaging disabled).
;-----------------------------------------------------------------------
; Scratch slots used by VERTx8, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8 0

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
394
;-----------------------------------------------------------------------
; void vp9_filter_block1d16_v8_ssse3(unsigned char *src_ptr,
;                                    unsigned int   src_pitch,
;                                    unsigned char *output_ptr,
;                                    unsigned int   out_pitch,
;                                    unsigned int   output_height,
;                                    short         *filter)
;
; Vertical 8-tap filter, 16 pixels wide; the result is stored to
; output_ptr (VERTx16 with averaging disabled).
;-----------------------------------------------------------------------
; Scratch slots used by VERTx16, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16 0

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
435
436;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
437
438
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v8_avg_ssse3
;
; Vertical 8-tap filter, 4 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at output_ptr before being stored
; (VERTx4 with averaging enabled).  Six arguments are shadowed and
; read as src_ptr, src_pitch, output_ptr, out_pitch, output_height,
; filter.
;-----------------------------------------------------------------------
; Scratch slots used by VERTx4, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4 1

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
470
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v8_avg_ssse3
;
; Vertical 8-tap filter, 8 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at output_ptr before being stored
; (VERTx8 with averaging enabled).  Six arguments are shadowed and
; read as src_ptr, src_pitch, output_ptr, out_pitch, output_height,
; filter.
;-----------------------------------------------------------------------
; Scratch slots used by VERTx8, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8 1

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
502
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v8_avg_ssse3
;
; Vertical 8-tap filter, 16 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at output_ptr before being stored
; (VERTx16 with averaging enabled).  Six arguments are shadowed and
; read as src_ptr, src_pitch, output_ptr, out_pitch, output_height,
; filter.
;-----------------------------------------------------------------------
; Scratch slots used by VERTx16, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16 1

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
534
535;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;-----------------------------------------------------------------------
; HORIZx4_ROW src_dst, tmp
;   Filters one row of 4 output pixels horizontally.
;     %1  in:  16 source bytes;  out: 4 result bytes in its low dword
;              (repeated by packuswb)
;     %2  scratch register
;   Also uses xmm4, xmm5 and xmm6 as scratch, so %1/%2 must not be any
;   of those.  Needs k0k1k4k5, k2k3k6k7 and krd to be defined.
;-----------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2,   %1                    ;second copy of the source bytes

    ;low qword: taps 0,1 products; high qword: taps 4,5 products
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   %1,   k0k1k4k5

    ;low qword: taps 2,3 products; high qword: taps 6,7 products
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %2,   k2k3k6k7

    movdqa      xmm4, %1                    ;k0k1 terms
    movdqa      xmm5, %2                    ;k2k3 terms
    movdqa      xmm6, xmm5                  ;k2k3 terms, kept for the min
    psrldq      %1,   8                     ;k4k5 terms down to the low qword
    psrldq      %2,   8                     ;k6k7 terms down to the low qword

    paddsw      xmm4, %2                    ;k0k1 + k6k7
    pmaxsw      xmm5, %1                    ;max(k2k3, k4k5)
    pminsw      %1, xmm6                    ;min(k2k3, k4k5)
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1,   krd                   ;+64
    psraw       %1,   7                     ;>>7
    packuswb    %1,   %1                    ;clamp to unsigned bytes
%endm
559
;-----------------------------------------------------------------------
; HORIZx4 <avg>
;   8-tap horizontal filter over rows that are 4 pixels wide, two rows
;   per loop iteration plus one trailing row when output_height is odd.
;     %1 = 0  store the filtered pixels to the destination
;     %1 = 1  pavgb the filtered pixels with the bytes already in the
;             destination, then store
;
;   Reads arg(0..5) = src_ptr, src_pixels_per_line, output_ptr,
;   output_pitch, output_height, filter.  Each row reads 16 source
;   bytes starting at src - 3.  The invoking function must have
;   %define'd k0k1k4k5, k2k3k6k7 and krd as 16-byte aligned slots.
;
;   A height below 2 skips the row-pair loop entirely, so heights of
;   0 and 1 are handled instead of wrapping the loop counter.
;
;   Uses rax, rcx, rdx, rsi, rdi and xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two 16-bit words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps: 16-bit -> signed 8-bit
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: do not enter the loop
.loop:
    ;Do two rows once
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm1,   [rsi + 5]
    movq        xmm2,   [rsi + rax - 3]
    movq        xmm3,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm1
    punpcklqdq  xmm2,   xmm3

    HORIZx4_ROW xmm0,   xmm1
    HORIZx4_ROW xmm2,   xmm3
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
    movd        xmm3,   [rdi + rdx]
    pavgb       xmm2,   xmm3
%endif
    movd        [rdi],  xmm0
    movd        [rdi +rdx],  xmm2

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)       ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]    ; load src
    movq        xmm1,   [rsi + 5]
    punpcklqdq  xmm0,   xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movd        [rdi],  xmm0
.done:
%endm
629
;-----------------------------------------------------------------------
; HORIZx8_ROW src_dst, tmp1, tmp2, tmp3
;   Filters one row of 8 output pixels horizontally.
;     %1      in:  16 source bytes;  out: 8 result bytes in its low
;             qword (repeated by packuswb)
;     %2-%4   scratch registers
;   Needs k0k1, k2k3, k4k5, k6k7 and krd to be defined.
;-----------------------------------------------------------------------
%macro HORIZx8_ROW 4
    ;one copy of the source bytes per tap pair
    movdqa      %2,   %1
    movdqa      %3,   %1
    movdqa      %4,   %1

    ;taps 0,1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   %1,   k0k1

    ;taps 2,3
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %2,   k2k3

    ;taps 4,5
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   %3,   k4k5

    ;taps 6,7
    pshufb      %4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   %4,   k6k7

    ;saturating accumulation: outer terms, then min and max of the
    ;two centre terms
    paddsw      %1,   %4
    movdqa      %4,   %2
    pmaxsw      %2,   %3
    pminsw      %3,   %4
    paddsw      %1,   %3
    paddsw      %1,   %2

    paddsw      %1,   krd                   ;+64
    psraw       %1,   7                     ;>>7
    packuswb    %1,   %1                    ;clamp to unsigned bytes
%endm
656
;-----------------------------------------------------------------------
; HORIZx8 <avg>
;   8-tap horizontal filter over rows that are 8 pixels wide, two rows
;   per loop iteration plus one trailing row when output_height is odd.
;     %1 = 0  store the filtered pixels to the destination
;     %1 = 1  pavgb the filtered pixels with the bytes already in the
;             destination, then store
;
;   Reads arg(0..5) = src_ptr, src_pixels_per_line, output_ptr,
;   output_pitch, output_height, filter.  Each row reads 16 source
;   bytes starting at src - 3.  The invoking function must have
;   %define'd k0k1, k2k3, k4k5, k6k7 and krd as 16-byte aligned slots.
;
;   A height below 2 skips the row-pair loop entirely, so heights of
;   0 and 1 are handled instead of wrapping the loop counter.
;
;   Uses rax, rcx, rdx, rsi, rdi and xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two 16-bit words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx                   ;rounding constant, same form as HORIZx4/HORIZx16
    packsswb    xmm4, xmm4                  ;taps: 16-bit -> signed 8-bit
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each tap pair to all 8 words
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;broadcast the rounding words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: do not enter the loop

.loop:
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm3,   [rsi + 5]
    movq        xmm4,   [rsi + rax - 3]
    movq        xmm7,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm3
    punpcklqdq  xmm4,   xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1,   [rdi]
    movq        xmm2,   [rdi + rdx]
    pavgb       xmm0,   xmm1
    pavgb       xmm4,   xmm2
%endif
    movq        [rdi],  xmm0
    movq        [rdi + rdx],  xmm4

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)    ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]
    movq        xmm3,   [rsi + 5]
    punpcklqdq  xmm0,   xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movq        [rdi],  xmm0
.done:
%endm
732
;-----------------------------------------------------------------------
; HORIZx16 <avg>
;   8-tap horizontal filter over rows that are 16 pixels wide, one row
;   per loop iteration, computed as two 8-pixel halves.
;     %1 = 0  store the result, %1 = 1  pavgb with the destination first
;
;   Reads arg(0..5) = src_ptr, src_pixels_per_line, output_ptr,
;   output_pitch, output_height, filter.  Each row reads 24 source
;   bytes starting at src - 3.  The destination is accessed with
;   movdqa, so every output row must be 16-byte aligned.
;   k0k1, k2k3, k4k5, k6k7 and krd must be %define'd by the invoking
;   function as 16-byte aligned slots.
;
;   Uses rax, rcx, rdx, rsi, rdi and xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx16 1
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr

    ;Pack the eight 16-bit taps to bytes and spill one broadcast
    ;register per tap pair.
    movdqa      xmm4, [rdx]
    packsswb    xmm4, xmm4

    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0

    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1

    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2

    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3

    ;Rounding: 64 in every 16-bit lane.
    mov         rcx, 0x0400040
    movq        xmm5, rcx
    pshufd      xmm5, xmm5, 0
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    ;xmm0 = source bytes -3..12, xmm4 = source bytes 5..20
    movq        xmm0,   [rsi - 3]
    movq        xmm4,   [rsi + 5]
    movq        xmm7,   [rsi + 13]
    punpcklqdq  xmm0,   xmm4
    punpcklqdq  xmm4,   xmm7

    ;---------------- left 8 pixels (xmm0-xmm3) ----------------
    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0
    movdqa      xmm3,   xmm0

    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0,   k0k1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5
    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm3,   k6k7

    paddsw      xmm0,   xmm3
    movdqa      xmm3,   xmm1
    pmaxsw      xmm1,   xmm2
    pminsw      xmm2,   xmm3
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm1

    paddsw      xmm0,   krd
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0

    ;---------------- right 8 pixels (xmm4-xmm7) ---------------
    movdqa      xmm5,   xmm4
    movdqa      xmm6,   xmm4
    movdqa      xmm7,   xmm4

    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm4,   k0k1
    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm5,   k2k3
    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm6,   k4k5
    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm7,   k6k7

    paddsw      xmm4,   xmm7
    movdqa      xmm7,   xmm5
    pmaxsw      xmm5,   xmm6
    pminsw      xmm6,   xmm7
    paddsw      xmm4,   xmm6
    paddsw      xmm4,   xmm5

    paddsw      xmm4,   krd
    psraw       xmm4,   7
    packuswb    xmm4,   xmm4

    ;join the two halves into one 16-byte row
    punpcklqdq  xmm0,   xmm4
%if %1
    movdqa      xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movdqa      [rdi],  xmm0

    ;next source and destination row
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .loop
%endm
830
;-----------------------------------------------------------------------
; void vp9_filter_block1d4_h8_ssse3(unsigned char *src_ptr,
;                                   unsigned int   src_pixels_per_line,
;                                   unsigned char *output_ptr,
;                                   unsigned int   output_pitch,
;                                   unsigned int   output_height,
;                                   short         *filter)
;
; Horizontal 8-tap filter, 4 pixels wide; the result is stored to
; output_ptr (HORIZx4 with averaging disabled).
;-----------------------------------------------------------------------
; Scratch slots used by HORIZx4, relative to the aligned rsp.
%define k0k1k4k5 [rsp + 16 * 0]
%define k2k3k6k7 [rsp + 16 * 1]
%define krd      [rsp + 16 * 2]

global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers, GOT
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi

    ; three 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4 0

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16 * 3
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
869
;-----------------------------------------------------------------------
; void vp9_filter_block1d8_h8_ssse3(unsigned char *src_ptr,
;                                   unsigned int   src_pixels_per_line,
;                                   unsigned char *output_ptr,
;                                   unsigned int   output_pitch,
;                                   unsigned int   output_height,
;                                   short         *filter)
;
; Horizontal 8-tap filter, 8 pixels wide; the result is stored to
; output_ptr (HORIZx8 with averaging disabled).
;-----------------------------------------------------------------------
; Scratch slots used by HORIZx8, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers, GOT
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8 0

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
911
;-----------------------------------------------------------------------
; void vp9_filter_block1d16_h8_ssse3(unsigned char *src_ptr,
;                                    unsigned int   src_pixels_per_line,
;                                    unsigned char *output_ptr,
;                                    unsigned int   output_pitch,
;                                    unsigned int   output_height,
;                                    short         *filter)
;
; Horizontal 8-tap filter, 16 pixels wide; the result is stored to
; output_ptr (HORIZx16 with averaging disabled).
;-----------------------------------------------------------------------
; Scratch slots used by HORIZx16, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers, GOT
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16 0

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
953
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h8_avg_ssse3
;
; Horizontal 8-tap filter, 4 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at output_ptr before being stored
; (HORIZx4 with averaging enabled).  Six arguments are shadowed and
; read as src_ptr, src_pixels_per_line, output_ptr, output_pitch,
; output_height, filter.
;-----------------------------------------------------------------------
; Scratch slots used by HORIZx4, relative to the aligned rsp.
%define k0k1k4k5 [rsp + 16 * 0]
%define k2k3k6k7 [rsp + 16 * 1]
%define krd      [rsp + 16 * 2]

global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers, GOT
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi

    ; three 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4 1

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16 * 3
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
983
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h8_avg_ssse3
;
; Horizontal 8-tap filter, 8 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at output_ptr before being stored
; (HORIZx8 with averaging enabled).  Six arguments are shadowed and
; read as src_ptr, src_pixels_per_line, output_ptr, output_pitch,
; output_height, filter.
;-----------------------------------------------------------------------
; Scratch slots used by HORIZx8, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers, GOT
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8 1

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1016
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h8_avg_ssse3
;
; Horizontal 8-tap filter, 16 pixels wide; each filtered row is
; averaged (pavgb) with the bytes already at output_ptr before being
; stored (HORIZx16 with averaging enabled).  Six arguments are shadowed
; and read as src_ptr, src_pixels_per_line, output_ptr, output_pitch,
; output_height, filter.
;-----------------------------------------------------------------------
; Scratch slots used by HORIZx16, relative to the aligned rsp.
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]

global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    ; prolog: frame pointer, argument shadowing, saved registers, GOT
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi

    ; five 16-byte locals on a 16-byte aligned stack
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16 1

    ; drop the locals; pop rsp presumably reloads the pre-alignment
    ; stack pointer saved by ALIGN_STACK (confirm in x86_abi_support.asm)
    add         rsp, 16*5
    pop         rsp

    ; epilog: mirror of the prolog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; pshufb masks used by the horizontal filters.  Mask tNtM picks, for
; each of 8 output positions i, the source byte pair (i + N, i + M) so
; that pmaddubsw can multiply it by the matching tap pair.
;-----------------------------------------------------------------------
SECTION_RODATA

align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4
    db  4, 5, 5, 6, 6, 7, 7, 8

align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6
    db  6, 7, 7, 8, 8, 9, 9, 10

align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8
    db  8, 9, 9, 10, 10, 11, 11, 12

align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10
    db  10, 11, 11, 12, 12, 13, 13, 14
1062