1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14;Note: tap3 and tap4 have to be applied and added after other taps to avoid
15;overflow.
16
;HIGH_GET_FILTERS_4
;Prepares the aligned stack slots that HIGH_APPLY_FILTER_4 reads:
;  k0k6, k1k7, k2k5, k3k4 - tap pairs interleaved as 16-bit words (for pmaddwd)
;  krd                    - rounding constant 0x40 in every dword
;  max / min              - clamp limits (1 << bps) - 1 and 0 in every word
;Reads arg(5) (filter ptr, must be 16-byte aligned for movdqa) and arg(6) (bps).
;Clobbers rcx, rdx and xmm0-xmm7. The invoking function must %define the slots.
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;rdx = filter ptr
    movdqa      xmm7, [rdx]                 ;xmm7 = k0 .. k7

    ;pshuflw only permutes the low four words, so work on the low qword.
    pshuflw     xmm3, xmm7, 11111111b       ;k3 in words 0-3
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm0, xmm7, 00000000b       ;k0
    psrldq      xmm7, 8                     ;move k4 .. k7 into the low qword
    pshuflw     xmm4, xmm7, 00000000b       ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    ;interleave the partner taps and park each pair on the stack
    punpcklwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...
    movdqa      k3k4, xmm3
    punpcklwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    movdqa      k2k5, xmm2
    punpcklwd   xmm1, xmm7                  ;k1 k7 k1 k7 ...
    movdqa      k1k7, xmm1
    punpcklwd   xmm0, xmm6                  ;k0 k6 k0 k6 ...
    movdqa      k0k6, xmm0

    ;rounding term added before the >> 7 in HIGH_APPLY_FILTER_4
    mov         rcx, 0x00000040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;0x40 in all four dwords
    movdqa      krd, xmm6

    ;clamp limits: max = (1 << bps) - 1 per word, min = 0
    mov         rdx, 0x00010001
    movq        xmm0, rdx
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0                  ;keep a copy of the ones
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm1, rcx                   ;shift count
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    movdqa      max, xmm0                   ;max value (for clamping)
    pxor        xmm1, xmm1
    movdqa      min, xmm1                   ;min value (for clamping)

%endm
60
;HIGH_APPLY_FILTER_4 avg
;Filters four 16-bit samples and writes them to [rdi].
;  in : xmm0 .. xmm7 hold the tap-0 .. tap-7 inputs in their low four words
;       k0k6/k1k7/k2k5/k3k4, krd, max, min prepared by HIGH_GET_FILTERS_4
;  %1 : non-zero -> average the result with the samples already at [rdi]
;Clobbers xmm0-xmm3.
%macro HIGH_APPLY_FILTER_4 1
    ;pair each input with its partner tap and multiply-accumulate the pair
    punpcklwd   xmm3, xmm4                  ;taps 3/4
    pmaddwd     xmm3, k3k4
    punpcklwd   xmm2, xmm5                  ;taps 2/5
    pmaddwd     xmm2, k2k5
    punpcklwd   xmm1, xmm7                  ;taps 1/7
    pmaddwd     xmm1, k1k7
    punpcklwd   xmm0, xmm6                  ;taps 0/6
    pmaddwd     xmm0, k0k6

    ;32-bit sums, centre taps (3/4) added last
    paddd       xmm0, xmm1
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3

    paddd       xmm0, krd                   ;+ 0x40
    psrad       xmm0, 7                     ;>> 7
    packssdw    xmm0, xmm0                  ;back to 16-bit words

    ;keep the result inside [min, max]
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]                 ;current destination samples
    pavgw       xmm0, xmm1                  ;rounded average with them
%endif
    movq        [rdi], xmm0                 ;store four samples
%endm
90
;HIGH_GET_FILTERS
;Set-up shared by the 8- and 16-wide functions:
;  rsi = arg(0) (src_ptr), rdi = arg(2) (output_ptr)
;  k0k1, k2k5, k3k4, k6k7 - tap pairs interleaved as 16-bit words (for pmaddwd)
;  krd                    - rounding constant 0x40 in every dword
;  max / min              - clamp limits (1 << bps) - 1 and 0 in every word
;Reads arg(5) (filter ptr, must be 16-byte aligned for movdqa) and arg(6) (bps).
;Clobbers rcx, rdx and xmm0-xmm7. The invoking function must %define the slots.
%macro HIGH_GET_FILTERS 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm7, [rdx]                 ;xmm7 = k0 .. k7

    ;k0/k1 both live in the low qword: broadcast there and interleave low halves
    pshuflw     xmm0, xmm7, 00000000b       ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    punpcklwd   xmm0, xmm1                  ;k0 k1 k0 k1 ...
    movdqa      k0k1, xmm0                  ;store filter factors on stack

    ;k2 is in the low qword, k5 in the high one: copy k2 up, interleave highs
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    punpcklqdq  xmm2, xmm2                  ;k2 in all eight words
    punpckhwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    movdqa      k2k5, xmm2

    ;same scheme for k3 (low qword) and k4 (high qword)
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 00000000b       ;k4
    punpcklqdq  xmm3, xmm3
    punpckhwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...
    movdqa      k3k4, xmm3

    ;k6/k7 both live in the high qword
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7
    punpckhwd   xmm6, xmm7                  ;k6 k7 k6 k7 ...
    movdqa      k6k7, xmm6

    ;rounding term added before the >> 7 in HIGH_APPLY_FILTER_8
    mov         rcx, 0x00000040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;0x40 in all four dwords
    movdqa      krd, xmm6                   ;rounding

    ;clamp limits: max = (1 << bps) - 1 per word, min = 0
    mov         rdx, 0x00010001
    movq        xmm0, rdx
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0                  ;keep a copy of the ones
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm1, rcx                   ;shift count
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    movdqa      max, xmm0                   ;max value (for clamping)
    pxor        xmm1, xmm1
    movdqa      min, xmm1                   ;min value (for clamping)
%endm
135
;LOAD_VERT_8 offset
;Loads eight vertically adjacent 16-byte vectors into xmm0 .. xmm7
;(xmmN = row N), starting %1 bytes into the row at rsi.
;  in : rax = row stride in bytes, rdx = 3 * rax (set up by the caller)
;  out: rsi is advanced by ONE row (callers rely on this to step down)
%macro LOAD_VERT_8 1
    movdqu      xmm0, [rsi + %1]            ;row 0
    movdqu      xmm1, [rsi + rax + %1]      ;row 1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;row 6 (6 * stride)
    lea         rsi,  [rsi + rax]           ;rsi -> row 1; offsets below are
                                            ;relative to the new base
    movdqu      xmm2, [rsi + rax + %1]      ;row 2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ;row 3
    movdqu      xmm4, [rsi + rdx + %1]      ;row 4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ;row 5
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;row 7
%endm
147
;HIGH_APPLY_FILTER_8 avg, offset
;Filters eight 16-bit samples and writes them to [rdi + %2].
;  in : xmm0 .. xmm7 = tap-0 .. tap-7 input vectors (eight words each)
;       k0k1/k6k7/k2k5/k3k4, krd, max, min prepared by HIGH_GET_FILTERS
;       temp = 16-byte aligned scratch slot on the stack
;  %1 : non-zero -> average the result with the samples already at [rdi + %2]
;  %2 : byte offset into the destination row
;Clobbers xmm0-xmm7 and temp.
;
;All eight xmm registers arrive full, so the tap-4 vector is spilled once to
;free a scratch register; each tap pair is then multiplied and accumulated
;straight away (low half in xmm0, high half in xmm4). temp sits in the same
;aligned stack area as the k* slots that pmaddwd already reads as aligned
;memory operands, so movdqa is used for it.
%macro HIGH_APPLY_FILTER_8 2
    movdqa      temp, xmm4                  ;spill tap 4, xmm4 becomes scratch

    ;taps 0/1 start the two accumulators
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ;samples 0-3
    punpckhwd   xmm4, xmm1                  ;samples 4-7
    pmaddwd     xmm0, k0k1
    pmaddwd     xmm4, k0k1

    ;taps 6/7
    movdqa      xmm1, xmm6
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm1, xmm7
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm1, k6k7
    paddd       xmm0, xmm6
    paddd       xmm4, xmm1

    ;taps 2/5
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm5
    punpckhwd   xmm7, xmm5
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm7, k2k5
    paddd       xmm0, xmm2
    paddd       xmm4, xmm7

    ;taps 3/4 (centre taps are added last)
    movdqa      xmm5, temp                  ;reload tap 4
    movdqa      xmm1, xmm3
    punpcklwd   xmm3, xmm5
    punpckhwd   xmm1, xmm5
    pmaddwd     xmm3, k3k4
    pmaddwd     xmm1, k3k4
    paddd       xmm0, xmm3
    paddd       xmm4, xmm1

    paddd       xmm0, krd                   ;rounding
    paddd       xmm4, krd
    psrad       xmm0, 7                     ;shift
    psrad       xmm4, 7
    packssdw    xmm0, xmm4                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]            ;current destination samples
    pavgw       xmm0, xmm1                  ;rounded average with them
%endif
    movdqu      [rdi + %2], xmm0            ;destination may be unaligned
%endm
199
;void vp9_high_filter_block1d4_v8_sse2
;(
;    src_ptr,        arg(0): 16-bit source samples; rows 0..7 from here feed
;                            output row 0
;    src_pitch,      arg(1): source stride in samples (doubled to bytes below)
;    output_ptr,     arg(2): 16-bit destination samples
;    out_pitch,      arg(3): destination stride in samples
;    output_height,  arg(4): number of rows to produce (0 -> nothing written)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned
;    bps             arg(6): bit depth; output clamped to [0, (1 << bps) - 1]
;)
;8-tap vertical filter, 4 samples per row.
;Loop registers: rsi = source row, rdi = destination row, rax = source stride
;in bytes, rdx = 3 * rax, rbx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven aligned 16-byte slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;step down one row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
269
;void vp9_high_filter_block1d8_v8_sse2
;(
;    src_ptr,        arg(0): 16-bit source samples; rows 0..7 from here feed
;                            output row 0
;    src_pitch,      arg(1): source stride in samples (doubled to bytes below)
;    output_ptr,     arg(2): 16-bit destination samples
;    out_pitch,      arg(3): destination stride in samples
;    output_height,  arg(4): number of rows to produce (0 -> nothing written)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned
;    bps             arg(6): bit depth; output clamped to [0, (1 << bps) - 1]
;)
;8-tap vertical filter, 8 samples per row.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = 3 * rax,
;rbx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    LOAD_VERT_8 0                           ;also advances rsi by one row
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
328
;void vp9_high_filter_block1d16_v8_sse2
;(
;    src_ptr,        arg(0): 16-bit source samples; rows 0..7 from here feed
;                            output row 0
;    src_pitch,      arg(1): source stride in samples (doubled to bytes below)
;    output_ptr,     arg(2): 16-bit destination samples
;    out_pitch,      arg(3): destination stride in samples
;    output_height,  arg(4): number of rows to produce (0 -> nothing written)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned
;    bps             arg(6): bit depth; output clamped to [0, (1 << bps) - 1]
;)
;8-tap vertical filter, 16 samples per row, done as two 8-sample halves.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = 3 * rax,
;rbx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;rewind so the right half starts
                                            ;on the same row

    LOAD_VERT_8 16                          ;right half; advances rsi again
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
391
;void vp9_high_filter_block1d4_v8_avg_sse2
;(
;    src_ptr,        arg(0): 16-bit source samples; rows 0..7 from here feed
;                            output row 0
;    src_pitch,      arg(1): source stride in samples (doubled to bytes below)
;    output_ptr,     arg(2): 16-bit destination samples (read and written)
;    out_pitch,      arg(3): destination stride in samples
;    output_height,  arg(4): number of rows to produce (0 -> nothing written)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned
;    bps             arg(6): bit depth; output clamped to [0, (1 << bps) - 1]
;)
;8-tap vertical filter, 4 samples per row; each result is averaged (pavgw)
;with the sample already in the destination.
;Loop registers: rsi = source row, rdi = destination row, rax = source stride
;in bytes, rdx = 3 * rax, rbx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven aligned 16-byte slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;step down one row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
452
;void vp9_high_filter_block1d8_v8_avg_sse2
;(
;    src_ptr,        arg(0): 16-bit source samples; rows 0..7 from here feed
;                            output row 0
;    src_pitch,      arg(1): source stride in samples (doubled to bytes below)
;    output_ptr,     arg(2): 16-bit destination samples (read and written)
;    out_pitch,      arg(3): destination stride in samples
;    output_height,  arg(4): number of rows to produce (0 -> nothing written)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned
;    bps             arg(6): bit depth; output clamped to [0, (1 << bps) - 1]
;)
;8-tap vertical filter, 8 samples per row; each result is averaged (pavgw)
;with the sample already in the destination.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = 3 * rax,
;rbx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    LOAD_VERT_8 0                           ;also advances rsi by one row
    HIGH_APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
501
;void vp9_high_filter_block1d16_v8_avg_sse2
;(
;    src_ptr,        arg(0): 16-bit source samples; rows 0..7 from here feed
;                            output row 0
;    src_pitch,      arg(1): source stride in samples (doubled to bytes below)
;    output_ptr,     arg(2): 16-bit destination samples (read and written)
;    out_pitch,      arg(3): destination stride in samples
;    output_height,  arg(4): number of rows to produce (0 -> nothing written)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned
;    bps             arg(6): bit depth; output clamped to [0, (1 << bps) - 1]
;)
;8-tap vertical filter, 16 samples per row as two 8-sample halves; each
;result is averaged (pavgw) with the sample already in the destination.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = 3 * rax,
;rbx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * source stride
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    HIGH_APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;rewind so the right half starts
                                            ;on the same row

    LOAD_VERT_8 16                          ;right half; advances rsi again
    HIGH_APPLY_FILTER_8 1, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
554
;void vp9_high_filter_block1d4_h8_sse2
;(
;    src_ptr,             arg(0): 16-bit source samples; samples -3 .. +8
;                                 around it are read on every row
;    src_pixels_per_line, arg(1): source stride in samples (doubled to bytes)
;    output_ptr,          arg(2): 16-bit destination samples
;    output_pitch,        arg(3): destination stride in samples
;    output_height,       arg(4): number of rows to produce (0 -> nothing)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth; output clamped to
;                                 [0, (1 << bps) - 1]
;)
;8-tap horizontal filter, 4 samples per row.
;Loop registers: rsi = source row, rdi = destination row, rax = source stride
;in bytes, rdx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven aligned 16-byte slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    movdqu      xmm0,   [rsi - 6]           ;samples -3 .. 4 (tap 0 input)
    movdqu      xmm4,   [rsi + 2]           ;samples  1 .. 8 (tap 4 input)
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;each 2-byte shift slides the window one sample to the right
    psrldq      xmm1, 2                     ;tap 1 input
    psrldq      xmm6, 4                     ;tap 6 input
    psrldq      xmm7, 6                     ;tap 7 input
    psrldq      xmm2, 4                     ;tap 2 input
    psrldq      xmm3, 6                     ;tap 3 input
    psrldq      xmm5, 2                     ;tap 5 input

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
629
;void vp9_high_filter_block1d8_h8_sse2
;(
;    src_ptr,             arg(0): 16-bit source samples; samples -3 .. +11
;                                 around it are read on every row
;    src_pixels_per_line, arg(1): source stride in samples (doubled to bytes)
;    output_ptr,          arg(2): 16-bit destination samples
;    output_pitch,        arg(3): destination stride in samples
;    output_height,       arg(4): number of rows to produce (0 -> nothing)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth; output clamped to
;                                 [0, (1 << bps) - 1]
;)
;8-tap horizontal filter, 8 samples per row.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = destination stride
;in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    ;xmmN = the row shifted by N - 3 samples, i.e. the input for tap N
    movdqu      xmm0,   [rsi - 6]           ;load src
    movdqu      xmm1,   [rsi - 4]
    movdqu      xmm2,   [rsi - 2]
    movdqu      xmm3,   [rsi]
    movdqu      xmm4,   [rsi + 2]
    movdqu      xmm5,   [rsi + 4]
    movdqu      xmm6,   [rsi + 6]
    movdqu      xmm7,   [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
695
;void vp9_high_filter_block1d16_h8_sse2
;(
;    src_ptr,             arg(0): 16-bit source samples; samples -3 .. +19
;                                 around it are read on every row
;    src_pixels_per_line, arg(1): source stride in samples (doubled to bytes)
;    output_ptr,          arg(2): 16-bit destination samples
;    output_pitch,        arg(3): destination stride in samples
;    output_height,       arg(4): number of rows to produce (0 -> nothing)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth; output clamped to
;                                 [0, (1 << bps) - 1]
;)
;8-tap horizontal filter, 16 samples per row, done as two 8-sample halves.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = destination stride
;in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    ;left half: xmmN = the row shifted by N - 3 samples (input for tap N)
    movdqu      xmm0,   [rsi - 6]           ;load src
    movdqu      xmm1,   [rsi - 4]
    movdqu      xmm2,   [rsi - 2]
    movdqu      xmm3,   [rsi]
    movdqu      xmm4,   [rsi + 2]
    movdqu      xmm5,   [rsi + 4]
    movdqu      xmm6,   [rsi + 6]
    movdqu      xmm7,   [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    ;right half: same windows, 16 bytes (8 samples) further along
    movdqu      xmm0,   [rsi + 10]          ;load src
    movdqu      xmm1,   [rsi + 12]
    movdqu      xmm2,   [rsi + 14]
    movdqu      xmm3,   [rsi + 16]
    movdqu      xmm4,   [rsi + 18]
    movdqu      xmm5,   [rsi + 20]
    movdqu      xmm6,   [rsi + 22]
    movdqu      xmm7,   [rsi + 24]

    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
772
;void vp9_high_filter_block1d4_h8_avg_sse2
;(
;    src_ptr,             arg(0): 16-bit source samples; samples -3 .. +8
;                                 around it are read on every row
;    src_pixels_per_line, arg(1): source stride in samples (doubled to bytes)
;    output_ptr,          arg(2): 16-bit destination samples (read and written)
;    output_pitch,        arg(3): destination stride in samples
;    output_height,       arg(4): number of rows to produce (0 -> nothing)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth; output clamped to
;                                 [0, (1 << bps) - 1]
;)
;8-tap horizontal filter, 4 samples per row; each result is averaged (pavgw)
;with the sample already in the destination.
;Loop registers: rsi = source row, rdi = destination row, rax = source stride
;in bytes, rdx = destination stride in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven aligned 16-byte slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    movdqu      xmm0,   [rsi - 6]           ;samples -3 .. 4 (tap 0 input)
    movdqu      xmm4,   [rsi + 2]           ;samples  1 .. 8 (tap 4 input)
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;each 2-byte shift slides the window one sample to the right
    psrldq      xmm1, 2                     ;tap 1 input
    psrldq      xmm6, 4                     ;tap 6 input
    psrldq      xmm7, 6                     ;tap 7 input
    psrldq      xmm2, 4                     ;tap 2 input
    psrldq      xmm3, 6                     ;tap 3 input
    psrldq      xmm5, 2                     ;tap 5 input

    HIGH_APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
838
;void vp9_high_filter_block1d8_h8_avg_sse2
;(
;    src_ptr,             arg(0): 16-bit source samples; samples -3 .. +11
;                                 around it are read on every row
;    src_pixels_per_line, arg(1): source stride in samples (doubled to bytes)
;    output_ptr,          arg(2): 16-bit destination samples (read and written)
;    output_pitch,        arg(3): destination stride in samples
;    output_height,       arg(4): number of rows to produce (0 -> nothing)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth; output clamped to
;                                 [0, (1 << bps) - 1]
;)
;8-tap horizontal filter, 8 samples per row; each result is averaged (pavgw)
;with the sample already in the destination.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = destination stride
;in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    ;xmmN = the row shifted by N - 3 samples, i.e. the input for tap N
    movdqu      xmm0,   [rsi - 6]           ;load src
    movdqu      xmm1,   [rsi - 4]
    movdqu      xmm2,   [rsi - 2]
    movdqu      xmm3,   [rsi]
    movdqu      xmm4,   [rsi + 2]
    movdqu      xmm5,   [rsi + 4]
    movdqu      xmm6,   [rsi + 6]
    movdqu      xmm7,   [rsi + 8]

    HIGH_APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
895
;void vp9_high_filter_block1d16_h8_avg_sse2
;(
;    src_ptr,             arg(0): 16-bit source samples; samples -3 .. +19
;                                 around it are read on every row
;    src_pixels_per_line, arg(1): source stride in samples (doubled to bytes)
;    output_ptr,          arg(2): 16-bit destination samples (read and written)
;    output_pitch,        arg(3): destination stride in samples
;    output_height,       arg(4): number of rows to produce (0 -> nothing)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth; output clamped to
;                                 [0, (1 << bps) - 1]
;)
;8-tap horizontal filter, 16 samples per row as two 8-sample halves; each
;result is averaged (pavgw) with the sample already in the destination.
;Loop registers: rsi = source row, rdi = destination row (both loaded by
;HIGH_GET_FILTERS), rax = source stride in bytes, rdx = destination stride
;in bytes, rcx = rows left.
global sym(vp9_high_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight aligned 16-byte slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;dec/jnz below would wrap on 0

.loop:
    ;left half: xmmN = the row shifted by N - 3 samples (input for tap N)
    movdqu      xmm0,   [rsi - 6]           ;load src
    movdqu      xmm1,   [rsi - 4]
    movdqu      xmm2,   [rsi - 2]
    movdqu      xmm3,   [rsi]
    movdqu      xmm4,   [rsi + 2]
    movdqu      xmm5,   [rsi + 4]
    movdqu      xmm6,   [rsi + 6]
    movdqu      xmm7,   [rsi + 8]

    HIGH_APPLY_FILTER_8 1, 0

    ;right half: same windows, 16 bytes (8 samples) further along
    movdqu      xmm0,   [rsi + 10]          ;load src
    movdqu      xmm1,   [rsi + 12]
    movdqu      xmm2,   [rsi + 14]
    movdqu      xmm3,   [rsi + 16]
    movdqu      xmm4,   [rsi + 18]
    movdqu      xmm5,   [rsi + 20]
    movdqu      xmm6,   [rsi + 22]
    movdqu      xmm7,   [rsi + 24]

    HIGH_APPLY_FILTER_8 1, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK (presumably it
                                            ;pushed the unaligned rsp)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
963