1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is an
; even number. This function handles 8 pixels in horizontal direction, calculating ONE
; row each iteration to take advantage of the 128-bit operations.
;
; NOTE(review): the paragraph above appears to predate the contents of this file. The
; routines defined below are named *_h8 / *_v8 and use eight filter coefficients
; (k0..k7); confirm and update the description accordingly.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
23
24
%macro VERTx4 1
    ; Vertical 8-tap filter over a 4-pixel-wide column; one output row is
    ; produced per loop iteration.
    ;   %1 = 0 : store the filtered pixels
    ;   %1 = 1 : pavgb the filtered pixels with the bytes already at the
    ;            destination, then store
    ; The invoking function must define k0k1, k2k3, k4k5, k6k7 and krd as
    ; 16-byte-aligned memory slots (they are accessed with movdqa).
    ; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, plus r8
    ; when ABI_IS_32BIT is 0. There is no guard for output_height == 0.

    ; ---- pointer arguments ----
    mov         rsi, arg(0)                 ; source row that meets tap 0
    mov         rdi, arg(2)                 ; destination row
    mov         rdx, arg(5)                 ; filter coefficients

    ; ---- narrow the taps to bytes, broadcast each adjacent pair ----
    movdqa      xmm4, [rdx]                 ; 16 bytes of 16-bit taps
    packsswb    xmm4, xmm4                  ; signed bytes k0..k7

    pshuflw     xmm0, xmm4, 0b
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0                  ; k0,k1 repeated

    pshuflw     xmm1, xmm4, 01010101b
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1                  ; k2,k3 repeated

    pshuflw     xmm2, xmm4, 10101010b
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2                  ; k4,k5 repeated

    pshuflw     xmm3, xmm4, 11111111b
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3                  ; k6,k7 repeated

    ; ---- rounding term: 64 in every word, added before the >> 7 ----
    mov         rcx, 0x0400040
    movd        xmm5, rcx
    pshufd      xmm5, xmm5, 0
    movdqa      krd, xmm5

    ; ---- strides and row count ----
    movsxd      rdx, DWORD PTR arg(1)       ; source pitch
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; destination pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ; rows to produce

    lea         rax, [rsi + rdx]            ; rax = rsi + pitch (odd rows)
    lea         rbx, [rdx + rdx*2]
    add         rbx, rbx                    ; rbx = pitch * 6

.next_row:
    ; rows 0 and 1
    movd        xmm0, [rsi]
    movd        xmm1, [rsi + rdx]
    punpcklbw   xmm0, xmm1
    pmaddubsw   xmm0, k0k1

    ; rows 2 and 3
    movd        xmm2, [rsi + rdx * 2]
    movd        xmm3, [rax + rdx * 2]
    punpcklbw   xmm2, xmm3
    pmaddubsw   xmm2, k2k3

    ; rows 4 and 5
    movd        xmm4, [rsi + rdx * 4]
    movd        xmm5, [rax + rdx * 4]
    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, k4k5

    ; rows 6 and 7
    movd        xmm6, [rsi + rbx]
    movd        xmm7, [rax + rbx]
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, k6k7

    ; the adds saturate, so the summation order is kept fixed
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm4
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ; clamp to unsigned bytes
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

    ; step every pointer down one row
    add         rsi,  rdx
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; destination pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .next_row
%endm
108
%macro VERTx8 1
    ; Vertical 8-tap filter over an 8-pixel-wide column; one output row is
    ; produced per loop iteration.
    ;   %1 = 0 : store the filtered pixels
    ;   %1 = 1 : pavgb the filtered pixels with the bytes already at the
    ;            destination, then store
    ; The invoking function must define k0k1, k2k3, k4k5, k6k7 and krd as
    ; 16-byte-aligned memory slots (they are accessed with movdqa).
    ; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, plus r8
    ; when ABI_IS_32BIT is 0. There is no guard for output_height == 0.

    ; ---- pointer arguments ----
    mov         rsi, arg(0)                 ; source row that meets tap 0
    mov         rdi, arg(2)                 ; destination row
    mov         rdx, arg(5)                 ; filter coefficients

    ; ---- narrow the taps to bytes, broadcast each adjacent pair ----
    movdqa      xmm4, [rdx]                 ; 16 bytes of 16-bit taps
    packsswb    xmm4, xmm4                  ; signed bytes k0..k7

    pshuflw     xmm0, xmm4, 0b
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0                  ; k0,k1 repeated

    pshuflw     xmm1, xmm4, 01010101b
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1                  ; k2,k3 repeated

    pshuflw     xmm2, xmm4, 10101010b
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2                  ; k4,k5 repeated

    pshuflw     xmm3, xmm4, 11111111b
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3                  ; k6,k7 repeated

    ; ---- rounding term: 64 in every word, added before the >> 7 ----
    mov         rcx, 0x0400040
    movq        xmm5, rcx
    pshufd      xmm5, xmm5, 0
    movdqa      krd, xmm5

    ; ---- strides and row count ----
    movsxd      rdx, DWORD PTR arg(1)       ; source pitch
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; destination pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ; rows to produce

    lea         rax, [rsi + rdx]            ; rax = rsi + pitch (odd rows)
    lea         rbx, [rdx + rdx*2]
    add         rbx, rbx                    ; rbx = pitch * 6

.next_row:
    ; rows 0 and 1
    movq        xmm0, [rsi]
    movq        xmm1, [rsi + rdx]
    punpcklbw   xmm0, xmm1
    pmaddubsw   xmm0, k0k1

    ; rows 2 and 3
    movq        xmm2, [rsi + rdx * 2]
    movq        xmm3, [rax + rdx * 2]
    punpcklbw   xmm2, xmm3
    pmaddubsw   xmm2, k2k3

    ; rows 4 and 5
    movq        xmm4, [rsi + rdx * 4]
    movq        xmm5, [rax + rdx * 4]
    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, k4k5

    ; rows 6 and 7
    movq        xmm6, [rsi + rbx]
    movq        xmm7, [rax + rbx]
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, k6k7

    ; the adds saturate, so the summation order is kept fixed
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm4
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ; clamp to unsigned bytes
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ; step every pointer down one row
    add         rsi,  rdx
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; destination pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .next_row
%endm
192
193
%macro VERTx16 1
    ; Vertical 8-tap filter over a 16-pixel-wide column. Each loop iteration
    ; produces one output row as two independent 8-pixel halves (columns 0-7,
    ; then columns 8-15).
    ;   %1 = 0 : store the filtered pixels
    ;   %1 = 1 : pavgb the filtered pixels with the bytes already at the
    ;            destination, then store
    ; The invoking function must define k0k1, k2k3, k4k5, k6k7 and krd as
    ; 16-byte-aligned memory slots (they are accessed with movdqa).
    ; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, plus r8
    ; when ABI_IS_32BIT is 0. There is no guard for output_height == 0.

    ; ---- pointer arguments ----
    mov         rsi, arg(0)                 ; source row that meets tap 0
    mov         rdi, arg(2)                 ; destination row
    mov         rdx, arg(5)                 ; filter coefficients

    ; ---- narrow the taps to bytes, broadcast each adjacent pair ----
    movdqa      xmm4, [rdx]                 ; 16 bytes of 16-bit taps
    packsswb    xmm4, xmm4                  ; signed bytes k0..k7

    pshuflw     xmm0, xmm4, 0b
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0                  ; k0,k1 repeated

    pshuflw     xmm1, xmm4, 01010101b
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1                  ; k2,k3 repeated

    pshuflw     xmm2, xmm4, 10101010b
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2                  ; k4,k5 repeated

    pshuflw     xmm3, xmm4, 11111111b
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3                  ; k6,k7 repeated

    ; ---- rounding term: 64 in every word, added before the >> 7 ----
    mov         rcx, 0x0400040
    movq        xmm5, rcx
    pshufd      xmm5, xmm5, 0
    movdqa      krd, xmm5

    ; ---- strides and row count ----
    movsxd      rdx, DWORD PTR arg(1)       ; source pitch
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; destination pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ; rows to produce

    lea         rax, [rsi + rdx]            ; rax = rsi + pitch (odd rows)
    lea         rbx, [rdx + rdx*2]
    add         rbx, rbx                    ; rbx = pitch * 6

.next_row:
    ; ================= columns 0-7 =================
    movq        xmm0, [rsi]                 ; rows 0 and 1
    movq        xmm1, [rsi + rdx]
    punpcklbw   xmm0, xmm1
    pmaddubsw   xmm0, k0k1

    movq        xmm2, [rsi + rdx * 2]       ; rows 2 and 3
    movq        xmm3, [rax + rdx * 2]
    punpcklbw   xmm2, xmm3
    pmaddubsw   xmm2, k2k3

    movq        xmm4, [rsi + rdx * 4]       ; rows 4 and 5
    movq        xmm5, [rax + rdx * 4]
    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, k4k5

    movq        xmm6, [rsi + rbx]           ; rows 6 and 7
    movq        xmm7, [rax + rbx]
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, k6k7

    ; the adds saturate, so the summation order is kept fixed
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm4
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ; clamp to unsigned bytes
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ; stored before the right half is read

    ; ================= columns 8-15 =================
    movq        xmm0, [rsi + 8]             ; rows 0 and 1
    movq        xmm1, [rsi + rdx + 8]
    punpcklbw   xmm0, xmm1
    pmaddubsw   xmm0, k0k1

    movq        xmm2, [rsi + rdx * 2 + 8]   ; rows 2 and 3
    movq        xmm3, [rax + rdx * 2 + 8]
    punpcklbw   xmm2, xmm3
    pmaddubsw   xmm2, k2k3

    movq        xmm4, [rsi + rdx * 4 + 8]   ; rows 4 and 5
    movq        xmm5, [rax + rdx * 4 + 8]
    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, k4k5

    movq        xmm6, [rsi + rbx + 8]       ; rows 6 and 7
    movq        xmm7, [rax + rbx + 8]
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm4
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi+8], xmm0

    ; step every pointer down one row
    add         rsi,  rdx
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; destination pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .next_row
%endm
313
;void vp9_filter_block1d4_v8_ssse3
315;(
316;    unsigned char *src_ptr,
317;    unsigned int   src_pitch,
318;    unsigned char *output_ptr,
319;    unsigned int   out_pitch,
320;    unsigned int   output_height,
321;    short *filter
322;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    ; Entry point that runs VERTx4 with %1 = 0 (filtered pixels are stored
    ; without averaging). The six arguments are read through arg(0)..arg(5)
    ; inside the macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ; VERTx4 keeps pitch * 6 in rbx

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    VERTx4 0

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
354
355;void vp9_filter_block1d8_v8_ssse3
356;(
357;    unsigned char *src_ptr,
358;    unsigned int   src_pitch,
359;    unsigned char *output_ptr,
360;    unsigned int   out_pitch,
361;    unsigned int   output_height,
362;    short *filter
363;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    ; Entry point that runs VERTx8 with %1 = 0 (filtered pixels are stored
    ; without averaging). The six arguments are read through arg(0)..arg(5)
    ; inside the macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ; VERTx8 keeps pitch * 6 in rbx

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    VERTx8 0

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
395
396;void vp9_filter_block1d16_v8_ssse3
397;(
398;    unsigned char *src_ptr,
399;    unsigned int   src_pitch,
400;    unsigned char *output_ptr,
401;    unsigned int   out_pitch,
402;    unsigned int   output_height,
403;    short *filter
404;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    ; Entry point that runs VERTx16 with %1 = 0 (filtered pixels are stored
    ; without averaging). The six arguments are read through arg(0)..arg(5)
    ; inside the macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ; VERTx16 keeps pitch * 6 in rbx

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    VERTx16 0

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
436
437;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
438
439
global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    ; Entry point that runs VERTx4 with %1 = 1: each filtered row is
    ; combined with the existing destination bytes via pavgb before it is
    ; stored. The six arguments are read through arg(0)..arg(5) inside the
    ; macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ; VERTx4 keeps pitch * 6 in rbx

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    VERTx4 1

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
471
global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    ; Entry point that runs VERTx8 with %1 = 1: each filtered row is
    ; combined with the existing destination bytes via pavgb before it is
    ; stored. The six arguments are read through arg(0)..arg(5) inside the
    ; macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ; VERTx8 keeps pitch * 6 in rbx

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    VERTx8 1

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
503
global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    ; Entry point that runs VERTx16 with %1 = 1: each filtered row is
    ; combined with the existing destination bytes via pavgb before it is
    ; stored. The six arguments are read through arg(0)..arg(5) inside the
    ; macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ; VERTx16 keeps pitch * 6 in rbx

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    VERTx16 1

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
535
536;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%macro HORIZx4_ROW 2
    ; Filters one row of four pixels horizontally.
    ;   %1 : in  - 16 source bytes; out - result bytes in its low dword
    ;   %2 : scratch register (overwritten)
    ; Expects xmm6 = k0k1 (low qword) | k4k5 (high qword),
    ;         xmm7 = k2k3 (low qword) | k6k7 (high qword),
    ;         xmm5 = rounding words.
    movdqa      %2,   %1

    ; taps 0/1 in the low half, taps 4/5 in the high half
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   %1,   xmm6

    ; taps 2/3 in the low half, taps 6/7 in the high half
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %2,   xmm7

    paddsw      %1,   %2

    ; fold the high-half partial sums onto the low half
    movdqa      %2,   %1
    psrldq      %2,   8
    paddsw      %1,   %2

    ; round, scale down and clamp to unsigned bytes
    paddsw      %1,   xmm5
    psraw       %1,   7
    packuswb    %1,   %1
%endm
552
%macro HORIZx4 1
    ; Horizontal 8-tap filter producing rows of four pixels. Two rows are
    ; handled per loop iteration; a trailing single row is handled when
    ; output_height is odd.
    ;   %1 = 0 : store the filtered pixels
    ;   %1 = 1 : pavgb the filtered pixels with the bytes already at the
    ;            destination, then store
    ; Source bytes are read starting 3 bytes before src_ptr.
    ; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;64 in each word: rounding for >> 7

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;rcx = number of complete row pairs
    ; With fewer than two rows there is no pair to process. Entering the
    ; loop with rcx == 0 would write a row that was not requested and then
    ; wrap the counter in dec/jnz, so go straight to the odd-row tail.
    jz          .last_row
.loop:
    ;Do two rows once
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm1,   [rsi + 5]
    movq        xmm2,   [rsi + rax - 3]
    movq        xmm3,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm1
    punpcklqdq  xmm2,   xmm3

    HORIZx4_ROW xmm0,   xmm1
    HORIZx4_ROW xmm2,   xmm3
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
    movd        xmm3,   [rdi + rdx]
    pavgb       xmm2,   xmm3
%endif
    movd        [rdi],  xmm0
    movd        [rdi +rdx],  xmm2

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)       ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]    ; load src
    movq        xmm1,   [rsi + 5]
    punpcklqdq  xmm0,   xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movd        [rdi],  xmm0
.done:
%endm
618
%macro HORIZx8_ROW 4
    ; Filters one row of eight pixels horizontally.
    ;   %1        : in  - 16 source bytes; out - result bytes in its low qword
    ;   %2,%3,%4  : scratch registers (overwritten)
    ; Expects k0k1, k2k3, k4k5, k6k7 and krd to be defined by the caller.

    ; three working copies of the source bytes
    movdqa      %4,   %1
    movdqa      %3,   %1
    movdqa      %2,   %1

    ; taps 0/1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   %1,   k0k1

    ; taps 2/3
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %2,   k2k3

    ; taps 4/5
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   %3,   k4k5

    ; taps 6/7
    pshufb      %4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   %4,   k6k7

    ; the adds saturate, so the summation order is kept fixed
    paddsw      %1,   %2
    paddsw      %1,   %4
    paddsw      %1,   %3
    paddsw      %1,   krd

    ; scale down and clamp to unsigned bytes
    psraw       %1,   7
    packuswb    %1,   %1
%endm
641
%macro HORIZx8 1
    ; Horizontal 8-tap filter producing rows of eight pixels. Two rows are
    ; handled per loop iteration; a trailing single row is handled when
    ; output_height is odd.
    ;   %1 = 0 : store the filtered pixels
    ;   %1 = 1 : pavgb the filtered pixels with the bytes already at the
    ;            destination, then store
    ; The invoking function must define k0k1, k2k3, k4k5, k6k7 and krd as
    ; 16-byte-aligned memory slots (they are accessed with movdqa).
    ; Source bytes are read starting 3 bytes before src_ptr.
    ; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;64 in each word: rounding for >> 7

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;rcx = number of complete row pairs
    ; With fewer than two rows there is no pair to process. Entering the
    ; loop with rcx == 0 would write a row that was not requested and then
    ; wrap the counter in dec/jnz, so go straight to the odd-row tail.
    jz          .last_row

.loop:
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm3,   [rsi + 5]
    movq        xmm4,   [rsi + rax - 3]
    movq        xmm7,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm3
    punpcklqdq  xmm4,   xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1,   [rdi]
    movq        xmm2,   [rdi + rdx]
    pavgb       xmm0,   xmm1
    pavgb       xmm4,   xmm2
%endif
    movq        [rdi],  xmm0
    movq        [rdi + rdx],  xmm4

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)    ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]
    movq        xmm3,   [rsi + 5]
    punpcklqdq  xmm0,   xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movq        [rdi],  xmm0
.done:
%endm
717
%macro HORIZx16 1
    ; Horizontal 8-tap filter producing rows of sixteen pixels; one row per
    ; loop iteration, computed as two 8-pixel halves that are joined before
    ; the store.
    ;   %1 = 0 : store the filtered pixels
    ;   %1 = 1 : pavgb the filtered pixels with the bytes already at the
    ;            destination, then store
    ; The invoking function must define k0k1, k2k3, k4k5, k6k7 and krd as
    ; 16-byte-aligned memory slots. The destination row is accessed with
    ; movdqa, so it must be 16-byte aligned as well.
    ; Source bytes are read starting 3 bytes before src_ptr.
    ; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
    ; There is no guard for output_height == 0.

    ; ---- pointer arguments ----
    mov         rsi, arg(0)                 ; source row
    mov         rdi, arg(2)                 ; destination row
    mov         rdx, arg(5)                 ; filter coefficients

    ; ---- narrow the taps to bytes, broadcast each adjacent pair ----
    movdqa      xmm4, [rdx]                 ; 16 bytes of 16-bit taps
    packsswb    xmm4, xmm4                  ; signed bytes k0..k7

    pshuflw     xmm0, xmm4, 0b
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0                  ; k0,k1 repeated

    pshuflw     xmm1, xmm4, 01010101b
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1                  ; k2,k3 repeated

    pshuflw     xmm2, xmm4, 10101010b
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2                  ; k4,k5 repeated

    pshuflw     xmm3, xmm4, 11111111b
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3                  ; k6,k7 repeated

    ; ---- rounding term: 64 in every word, added before the >> 7 ----
    mov         rcx, 0x0400040
    movq        xmm5, rcx
    pshufd      xmm5, xmm5, 0
    movdqa      krd, xmm5

    ; ---- strides and row count ----
    movsxd      rax, dword ptr arg(1)       ; source pitch
    movsxd      rdx, dword ptr arg(3)       ; destination pitch
    movsxd      rcx, dword ptr arg(4)       ; rows to produce

.next_row:
    prefetcht0  [rsi + 2 * rax -3]

    ; xmm0 = source bytes -3..12, xmm4 = source bytes 5..20
    movq        xmm0,   [rsi - 3]
    movq        xmm4,   [rsi + 5]
    movq        xmm7,   [rsi + 13]
    punpcklqdq  xmm0,   xmm4
    punpcklqdq  xmm4,   xmm7

    ; ================= pixels 0-7 =================
    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0
    movdqa      xmm3,   xmm0

    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0,   k0k1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5
    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm3,   k6k7

    ; the adds saturate, so the summation order is kept fixed
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm3
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   krd
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0

    ; ================= pixels 8-15 =================
    movdqa      xmm5,   xmm4
    movdqa      xmm6,   xmm4
    movdqa      xmm7,   xmm4

    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm4,   k0k1
    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm5,   k2k3
    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm6,   k4k5
    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm7,   k6k7

    paddsw      xmm4,   xmm5
    paddsw      xmm4,   xmm7
    paddsw      xmm4,   xmm6
    paddsw      xmm4,   krd
    psraw       xmm4,   7
    packuswb    xmm4,   xmm4

    ; join both halves into one 16-byte row
    punpcklqdq  xmm0,   xmm4
%if %1
    movdqa      xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movdqa      [rdi],  xmm0

    ; advance to the next source / destination row
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .next_row
%endm
808
809;void vp9_filter_block1d4_h8_ssse3
810;(
811;    unsigned char  *src_ptr,
812;    unsigned int    src_pixels_per_line,
813;    unsigned char  *output_ptr,
814;    unsigned int    output_pitch,
815;    unsigned int    output_height,
816;    short *filter
817;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    ; Entry point that runs HORIZx4 with %1 = 0 (filtered pixels are stored
    ; without averaging). HORIZx4 keeps its coefficients in xmm5-xmm7, so no
    ; stack scratch area is set up here.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ; for the GLOBAL() table references
    push        rsi
    push        rdi

    HORIZx4 0

    ; ---- epilog: mirror image of the prolog ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
839
840;void vp9_filter_block1d8_h8_ssse3
841;(
842;    unsigned char  *src_ptr,
843;    unsigned int    src_pixels_per_line,
844;    unsigned char  *output_ptr,
845;    unsigned int    output_pitch,
846;    unsigned int    output_height,
847;    short *filter
848;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    ; Entry point that runs HORIZx8 with %1 = 0 (filtered pixels are stored
    ; without averaging). The six arguments are read through arg(0)..arg(5)
    ; inside the macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ; for the GLOBAL() table references
    push        rsi
    push        rdi

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    HORIZx8 0

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
881
882;void vp9_filter_block1d16_h8_ssse3
883;(
884;    unsigned char  *src_ptr,
885;    unsigned int    src_pixels_per_line,
886;    unsigned char  *output_ptr,
887;    unsigned int    output_pitch,
888;    unsigned int    output_height,
889;    short *filter
890;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    ; Entry point that runs HORIZx16 with %1 = 0 (filtered pixels are stored
    ; without averaging). The six arguments are read through arg(0)..arg(5)
    ; inside the macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ; for the GLOBAL() table references
    push        rsi
    push        rdi

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    HORIZx16 0

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
923
global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    ; Entry point that runs HORIZx4 with %1 = 1: each filtered row is
    ; combined with the existing destination bytes via pavgb before it is
    ; stored. HORIZx4 keeps its coefficients in xmm5-xmm7, so no stack
    ; scratch area is set up here.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ; for the GLOBAL() table references
    push        rsi
    push        rdi

    HORIZx4 1

    ; ---- epilog: mirror image of the prolog ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
945
global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    ; Entry point that runs HORIZx8 with %1 = 1: each filtered row is
    ; combined with the existing destination bytes via pavgb before it is
    ; stored. The six arguments are read through arg(0)..arg(5) inside the
    ; macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ; for the GLOBAL() table references
    push        rsi
    push        rdi

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    HORIZx8 1

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
978
global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    ; Entry point that runs HORIZx16 with %1 = 1: each filtered row is
    ; combined with the existing destination bytes via pavgb before it is
    ; stored. The six arguments are read through arg(0)..arg(5) inside the
    ; macro.

    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ; for the GLOBAL() table references
    push        rsi
    push        rdi

    ; ---- five 16-byte scratch slots addressed off the aligned rsp ----
    ALIGN_STACK 16, rax
    sub         rsp, 5*16
    %define k0k1 [rsp + 0x00]
    %define k2k3 [rsp + 0x10]
    %define k4k5 [rsp + 0x20]
    %define k6k7 [rsp + 0x30]
    %define krd  [rsp + 0x40]

    HORIZx16 1

    ; ---- release the scratch slots and undo ALIGN_STACK ----
    add         rsp, 5*16
    ; NOTE(review): presumably ALIGN_STACK pushed the pre-alignment rsp;
    ; confirm against x86_abi_support.asm.
    pop         rsp

    ; ---- epilog: mirror image of the prolog ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
; pshufb control masks used by the horizontal filters. Each mask gathers
; eight overlapping pairs of adjacent source bytes (i, i+1); successive
; tables start two bytes further along, one table per pair of filter taps.
align 16
shuf_t0t1:                                  ; pairs starting at bytes 0..7
    db  0, 1,  1, 2,  2, 3,  3, 4,  4, 5,  5, 6,  6, 7,  7, 8
align 16
shuf_t2t3:                                  ; pairs starting at bytes 2..9
    db  2, 3,  3, 4,  4, 5,  5, 6,  6, 7,  7, 8,  8, 9,  9, 10
align 16
shuf_t4t5:                                  ; pairs starting at bytes 4..11
    db  4, 5,  5, 6,  6, 7,  7, 8,  8, 9,  9, 10,  10, 11,  11, 12
align 16
shuf_t6t7:                                  ; pairs starting at bytes 6..13
    db  6, 7,  7, 8,  8, 9,  9, 10,  10, 11,  11, 12,  12, 13,  13, 14
1024