1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;Note: the tap3 and tap4 products must be accumulated after the products of
;all the other taps; adding them earlier can overflow (saturate) the 16-bit
;intermediate sums.
16
;-----------------------------------------------------------------------------
; GET_FILTERS_4
; Prepares the constants consumed by APPLY_FILTER_4 and stores them in the
; stack slots the including function has %define'd.
;   In:    arg(5) = pointer to eight 16-bit taps (read with movdqa, so the
;                   table has to be 16-byte aligned)
;   Out:   k0k1, k2k3, k5k4, k6k7 = tap pairs; the first-named tap fills
;                   words 0-3 and the second-named tap fills words 4-7
;          krd  = 0x0040 in every word (rounding term)
;          zero = all bits clear
;   Clobb: rcx, rdx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;rdx = filter ptr
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    pshuflw     xmm0, xmm7, 0b              ;k0 in words 0-3
    pshuflw     xmm1, xmm7, 01010101b       ;k1 in words 0-3
    punpcklqdq  xmm0, xmm1                  ;k0 | k1
    movdqa      k0k1, xmm0

    pshuflw     xmm2, xmm7, 10101010b       ;k2 in words 0-3
    pshuflw     xmm3, xmm7, 11111111b       ;k3 in words 0-3
    punpcklqdq  xmm2, xmm3                  ;k2 | k3
    movdqa      k2k3, xmm2

    psrldq      xmm7, 8                     ;move k4..k7 to the low qword
    pshuflw     xmm4, xmm7, 0b              ;k4 in words 0-3
    pshuflw     xmm5, xmm7, 01010101b       ;k5 in words 0-3
    punpcklqdq  xmm5, xmm4                  ;k5 | k4 (note the swapped order)
    movdqa      k5k4, xmm5

    pshuflw     xmm6, xmm7, 10101010b       ;k6 in words 0-3
    pshuflw     xmm7, xmm7, 11111111b       ;k7 in words 0-3 (xmm7 reused last)
    punpcklqdq  xmm6, xmm7                  ;k6 | k7
    movdqa      k6k7, xmm6

    mov         rcx, 0x0400040              ;two words of 0x0040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;replicate the low dword
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm
49
;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Filters 4 pixels and writes 4 bytes to [rdi].
;   %1:    0 = plain store; non-zero = pavgb the result with the 4 bytes
;          already at [rdi] before storing
;   In:    xmm0..xmm7 = source bytes for taps 0..7 (low dword of each)
;          rdi        = destination
;          k0k1, k2k3, k5k4, k6k7, krd, zero as stored by GET_FILTERS_4
;   Clobb: xmm0, xmm1, xmm2, xmm5, xmm6
; The paddsw sequence is order-sensitive: the tap 3 and tap 4 products are
; the last two terms added.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1                  ;taps 0|1 share one register
    punpcklbw   xmm0, zero                  ;bytes -> words
    pmullw      xmm0, k0k1                  ;scale by the tap pair

    punpckldq   xmm6, xmm7                  ;taps 6|7
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6k7

    punpckldq   xmm2, xmm3                  ;taps 2|3
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2k3

    punpckldq   xmm5, xmm4                  ;taps 5|4
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5k4

    ;low qword of xmm0 accumulates: (0+6), (1+7), 2, 5, 3, 4
    paddsw      xmm0, xmm6                  ;0+6 | 1+7
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;1+7 down to the low qword
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm2                  ;+ tap 2
    psrldq      xmm2, 8                     ;tap 3 down to the low qword
    paddsw      xmm0, xmm5                  ;+ tap 5
    psrldq      xmm5, 8                     ;tap 4 down to the low qword
    paddsw      xmm0, xmm2                  ;+ tap 3
    paddsw      xmm0, xmm5                  ;+ tap 4

    paddsw      xmm0, krd                   ;add rounding term
    psraw       xmm0, 7                     ;arithmetic shift right by 7
    packuswb    xmm0, xmm0                  ;words -> unsigned saturated bytes

%if %1
    movd        xmm1, [rdi]                 ;current destination bytes
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endm
87
;-----------------------------------------------------------------------------
; GET_FILTERS
; Loads the pointer arguments and prepares the constants consumed by
; APPLY_FILTER_8, storing them in the stack slots the including function has
; %define'd.
;   In:    arg(0) = src_ptr, arg(2) = output_ptr,
;          arg(5) = pointer to eight 16-bit taps (read with movdqa, so the
;                   table has to be 16-byte aligned)
;   Out:   rsi = src_ptr, rdi = output_ptr
;          k0..k7 = one tap each, replicated into all eight words
;          krd    = 0x0040 in every word (rounding term)
;          zero   = all bits clear
;   Clobb: rcx, rdx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro GET_FILTERS 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    ;taps 0-3 sit in the low qword: shuffle low words, widen from the low half
    pshuflw     xmm0, xmm7, 0b              ;k0
    punpcklwd   xmm0, xmm0
    movdqa      k0,   xmm0

    pshuflw     xmm1, xmm7, 01010101b       ;k1
    punpcklwd   xmm1, xmm1
    movdqa      k1,   xmm1

    pshuflw     xmm2, xmm7, 10101010b       ;k2
    punpcklwd   xmm2, xmm2
    movdqa      k2,   xmm2

    pshuflw     xmm3, xmm7, 11111111b       ;k3
    punpcklwd   xmm3, xmm3
    movdqa      k3,   xmm3

    ;taps 4-7 sit in the high qword: shuffle high words, widen from the high half
    pshufhw     xmm4, xmm7, 0b              ;k4
    punpckhwd   xmm4, xmm4
    movdqa      k4,   xmm4

    pshufhw     xmm5, xmm7, 01010101b       ;k5
    punpckhwd   xmm5, xmm5
    movdqa      k5,   xmm5

    pshufhw     xmm6, xmm7, 10101010b       ;k6
    punpckhwd   xmm6, xmm6
    movdqa      k6,   xmm6

    pshufhw     xmm7, xmm7, 11111111b       ;k7 (source register reused last)
    punpckhwd   xmm7, xmm7
    movdqa      k7,   xmm7

    mov         rcx, 0x0400040              ;two words of 0x0040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;replicate the low dword
    movdqa      krd, xmm6                   ;rounding

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm
129
;-----------------------------------------------------------------------------
; LOAD_VERT_8 offset
; Loads 8 bytes from each of 8 rows into the low qwords of xmm0..xmm7
; (xmmN = row N) and leaves rsi advanced by one row.
;   %1:    byte offset added to every row address
;   In:    rsi = address of row 0, rax = row stride,
;          rdx = 3 * rax (set up by the including function)
;   Out:   rsi = rsi + rax
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    ;addressed from the incoming rsi
    movq        xmm0, [rsi + %1]            ;row 0
    movq        xmm1, [rsi + rax + %1]      ;row 1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;row 6

    lea         rsi,  [rsi + rax]           ;step down one row

    ;addressed from the advanced rsi
    movq        xmm2, [rsi + rax + %1]      ;row 2
    movq        xmm3, [rsi + rax * 2 + %1]  ;row 3
    movq        xmm4, [rsi + rdx + %1]      ;row 4
    movq        xmm5, [rsi + rax * 4 + %1]  ;row 5
    movq        xmm7, [rsi + rdx * 2 + %1]  ;row 7
%endm
141
;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg, offset
; Filters 8 pixels and writes 8 bytes to [rdi + offset].
;   %1:    0 = plain store; non-zero = pavgb the result with the 8 bytes
;          already at the destination before storing
;   %2:    byte offset added to rdi
;   In:    xmm0..xmm7 = source bytes for taps 0..7 (low qword of each)
;          rdi        = destination
;          k0..k7, krd, zero as stored by GET_FILTERS
;   Clobb: xmm0-xmm7
; The paddsw sequence is order-sensitive: the tap 3 and tap 4 products are
; the last two terms added.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 2
    ;widen each tap's bytes to words and scale by its coefficient
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0
    punpcklbw   xmm1, zero
    pmullw      xmm1, k1
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6
    punpcklbw   xmm7, zero
    pmullw      xmm7, k7
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5
    punpcklbw   xmm3, zero
    pmullw      xmm3, k3
    punpcklbw   xmm4, zero
    pmullw      xmm4, k4

    ;saturating sum in the order 0, 1, 6, 7, 2, 5, 3, 4
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;add rounding term
    psraw       xmm0, 7                     ;arithmetic shift right by 7
    packuswb    xmm0, xmm0                  ;words -> unsigned saturated bytes
%if %1
    movq        xmm1, [rdi + %2]            ;current destination bytes
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm
178
;void vp9_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
;Vertical 8-tap filter, 4 pixels per output row. Each loop pass reads 4 bytes
;from rows 0..7 counted from rsi, stores one 4-byte result at rdi, then moves
;rsi down by src_pitch and rdi down by out_pitch.
;Loop registers: rsi = src, rdi = dst, rax = src_pitch, rdx = 3 * src_pitch,
;                rbx = out_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte constant slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx/rdx, so load them after

    movsxd      rax, DWORD PTR arg(1)       ;src_pitch
    lea         rdx, [rax + rax * 2]        ;3 * src_pitch
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

.next_row:
    movd        xmm0, [rsi]                 ;row 0
    movd        xmm1, [rsi + rax]           ;row 1
    movd        xmm2, [rsi + rax * 2]       ;row 2
    movd        xmm3, [rsi + rdx]           ;row 3
    movd        xmm4, [rsi + rax * 4]       ;row 4
    movd        xmm6, [rsi + rdx * 2]       ;row 6
    lea         rsi,  [rsi + rax]           ;src moves down one row
    movd        xmm5, [rsi + rax * 4]       ;row 5 (4 rows below the new rsi)
    movd        xmm7, [rsi + rdx * 2]       ;row 7 (6 rows below the new rsi)

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]            ;dst moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
245
;void vp9_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
;Vertical 8-tap filter, 8 pixels per output row. Each loop pass loads 8 rows
;with LOAD_VERT_8 (which also moves rsi down one row), stores one 8-byte
;result at rdi, then moves rdi down by out_pitch.
;Loop registers: rsi = src, rdi = dst, rax = src_pitch, rdx = 3 * src_pitch,
;                rbx = out_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rax, DWORD PTR arg(1)       ;src_pitch
    lea         rdx, [rax + rax * 2]        ;3 * src_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch

.next_row:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]            ;dst moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
304
;void vp9_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
;Vertical 8-tap filter, 16 pixels per output row, done as two 8-pixel halves.
;LOAD_VERT_8 moves rsi down one row, so rsi is stepped back between the
;halves; after the second half it is left one row further down.
;Loop registers: rsi = src, rdi = dst, rax = src_pitch, rdx = 3 * src_pitch,
;                rbx = out_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rax, DWORD PTR arg(1)       ;src_pitch
    lea         rdx, [rax + rax * 2]        ;3 * src_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch

.next_row:
    ;columns 0-7
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;back to the row this pass started on

    ;columns 8-15
    LOAD_VERT_8 8
    APPLY_FILTER_8 0, 8
    add         rdi, rbx                    ;dst moves down one row

    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
367
;vp9_filter_block1d4_v8_avg_sse2
;Reads arg(0)..arg(5) as src_ptr, src_pitch, output_ptr, out_pitch,
;output_height and filter.
;
;Vertical 8-tap filter, 4 pixels per output row, averaging variant: each
;4-byte result is combined with the bytes already at rdi using pavgb
;(APPLY_FILTER_4 1) before it is stored.
;Loop registers: rsi = src, rdi = dst, rax = src_pitch, rdx = 3 * src_pitch,
;                rbx = out_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte constant slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx/rdx, so load them after

    movsxd      rax, DWORD PTR arg(1)       ;src_pitch
    lea         rdx, [rax + rax * 2]        ;3 * src_pitch
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

.next_row:
    movd        xmm0, [rsi]                 ;row 0
    movd        xmm1, [rsi + rax]           ;row 1
    movd        xmm2, [rsi + rax * 2]       ;row 2
    movd        xmm3, [rsi + rdx]           ;row 3
    movd        xmm4, [rsi + rax * 4]       ;row 4
    movd        xmm6, [rsi + rdx * 2]       ;row 6
    lea         rsi,  [rsi + rax]           ;src moves down one row
    movd        xmm5, [rsi + rax * 4]       ;row 5 (4 rows below the new rsi)
    movd        xmm7, [rsi + rdx * 2]       ;row 7 (6 rows below the new rsi)

    APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]            ;dst moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
425
;vp9_filter_block1d8_v8_avg_sse2
;Reads arg(0)..arg(5) as src_ptr, src_pitch, output_ptr, out_pitch,
;output_height and filter.
;
;Vertical 8-tap filter, 8 pixels per output row, averaging variant: each
;8-byte result is combined with the bytes already at rdi using pavgb
;(APPLY_FILTER_8 1, 0) before it is stored.
;Loop registers: rsi = src, rdi = dst, rax = src_pitch, rdx = 3 * src_pitch,
;                rbx = out_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rax, DWORD PTR arg(1)       ;src_pitch
    lea         rdx, [rax + rax * 2]        ;3 * src_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch

.next_row:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]            ;dst moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
474
;vp9_filter_block1d16_v8_avg_sse2
;Reads arg(0)..arg(5) as src_ptr, src_pitch, output_ptr, out_pitch,
;output_height and filter.
;
;Vertical 8-tap filter, 16 pixels per output row, averaging variant, done as
;two 8-pixel halves. Each half is combined with the bytes already at the
;destination using pavgb before it is stored. LOAD_VERT_8 moves rsi down one
;row, so rsi is stepped back between the halves.
;Loop registers: rsi = src, rdi = dst, rax = src_pitch, rdx = 3 * src_pitch,
;                rbx = out_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rax, DWORD PTR arg(1)       ;src_pitch
    lea         rdx, [rax + rax * 2]        ;3 * src_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch

.next_row:
    ;columns 0-7
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;back to the row this pass started on

    ;columns 8-15
    LOAD_VERT_8 8
    APPLY_FILTER_8 1, 8
    add         rdi, rbx                    ;dst moves down one row

    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
527
;void vp9_filter_block1d4_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
;Horizontal 8-tap filter, 4 pixels per output row. Each loop pass reads 16
;bytes starting at rsi - 3, forms the tap N input as that vector shifted
;right by N bytes, stores one 4-byte result at rdi, then steps both pointers
;by their pitch.
;Loop registers: rsi = src, rdi = dst, rax = src_pixels_per_line,
;                rdx = output_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte constant slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx/rdx, so load them after

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;src_pixels_per_line
    mov         rdi, arg(2)                 ;output_ptr
    mov         rsi, arg(0)                 ;src_ptr

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 input (unaligned 16-byte read)

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]            ;next src row
    lea         rdi, [rdi + rdx]            ;next dst row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
601
;void vp9_filter_block1d8_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
;Horizontal 8-tap filter, 8 pixels per output row. Each loop pass reads 16
;bytes starting at rsi - 3, forms the tap N input as that vector shifted
;right by N bytes, stores one 8-byte result at rdi, then steps both pointers
;by their pitch.
;Loop registers: rsi = src, rdi = dst, rax = src_pixels_per_line,
;                rdx = output_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;src_pixels_per_line

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 input (unaligned 16-byte read)

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]            ;next src row
    lea         rdi, [rdi + rdx]            ;next dst row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
676
;void vp9_filter_block1d16_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
;Horizontal 8-tap filter, 16 pixels per output row, done as two 8-pixel
;halves. The first half reads 16 bytes at rsi - 3 and stores at rdi; the
;second reads 16 bytes at rsi + 5 and stores at rdi + 8. In each half the
;tap N input is the loaded vector shifted right by N bytes.
;Loop registers: rsi = src, rdi = dst, rax = src_pixels_per_line,
;                rdx = output_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;src_pixels_per_line

.next_row:
    ;output pixels 0-7
    movdqu      xmm0, [rsi - 3]             ;tap 0 input (unaligned 16-byte read)

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    ;output pixels 8-15
    movdqu      xmm0, [rsi + 5]             ;same window, 8 bytes further right

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 8

    lea         rsi, [rsi + rax]            ;next src row
    lea         rdi, [rdi + rdx]            ;next dst row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
771
;vp9_filter_block1d4_h8_avg_sse2
;Reads arg(0)..arg(5) as src_ptr, src_pixels_per_line, output_ptr,
;output_pitch, output_height and filter.
;
;Horizontal 8-tap filter, 4 pixels per output row, averaging variant: each
;4-byte result is combined with the bytes already at rdi using pavgb
;(APPLY_FILTER_4 1) before it is stored. The tap N input is the 16 bytes
;read at rsi - 3 shifted right by N bytes.
;Loop registers: rsi = src, rdi = dst, rax = src_pixels_per_line,
;                rdx = output_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte constant slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx/rdx, so load them after

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;src_pixels_per_line
    mov         rdi, arg(2)                 ;output_ptr
    mov         rsi, arg(0)                 ;src_ptr

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 input (unaligned 16-byte read)

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]            ;next src row
    lea         rdi, [rdi + rdx]            ;next dst row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
836
;vp9_filter_block1d8_h8_avg_sse2
;Reads arg(0)..arg(5) as src_ptr, src_pixels_per_line, output_ptr,
;output_pitch, output_height and filter.
;
;Horizontal 8-tap filter, 8 pixels per output row, averaging variant: each
;8-byte result is combined with the bytes already at rdi using pavgb
;(APPLY_FILTER_8 1, 0) before it is stored. The tap N input is the 16 bytes
;read at rsi - 3 shifted right by N bytes.
;Loop registers: rsi = src, rdi = dst, rax = src_pixels_per_line,
;                rdx = output_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;src_pixels_per_line

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 input (unaligned 16-byte read)

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]            ;next src row
    lea         rdi, [rdi + rdx]            ;next dst row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
902
;vp9_filter_block1d16_h8_avg_sse2
;Reads arg(0)..arg(5) as src_ptr, src_pixels_per_line, output_ptr,
;output_pitch, output_height and filter.
;
;Horizontal 8-tap filter, 16 pixels per output row, averaging variant, done
;as two 8-pixel halves. The first half reads 16 bytes at rsi - 3 and targets
;rdi; the second reads 16 bytes at rsi + 5 and targets rdi + 8. Each half is
;combined with the bytes already at its destination using pavgb before it is
;stored.
;Loop registers: rsi = src, rdi = dst, rax = src_pixels_per_line,
;                rdx = output_pitch, rcx = rows remaining.
global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi; clobbers rcx/rdx

    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rdx, DWORD PTR arg(3)       ;output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;src_pixels_per_line

.next_row:
    ;output pixels 0-7
    movdqu      xmm0, [rsi - 3]             ;tap 0 input (unaligned 16-byte read)

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0

    ;output pixels 8-15
    movdqu      xmm0, [rsi + 5]             ;same window, 8 bytes further right

    movdqa      xmm1, xmm0                  ;tap 1
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0                  ;tap 2
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0                  ;tap 3
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0                  ;tap 4
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0                  ;tap 5
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0                  ;tap 6
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0                  ;tap 7
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 8

    lea         rsi, [rsi + rax]            ;next src row
    lea         rdi, [rdi + rdx]            ;next dst row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the constant slots
    pop         rsp                         ;pre-alignment rsp (ALIGN_STACK is expected to have pushed it)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
988