1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
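;Note: tap3 and tap4 have to be applied and added after the other taps to avoid
;overflow. The macros below follow this order: the k3k4 products are always the
;last ones added to the 32-bit (dword) accumulators.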
16
;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS_4
;   Prepares the constants used by HIGH_APPLY_FILTER_4.
;   In:     arg(5) = pointer to the 8 filter taps (16-bit each; loaded with
;                    movdqa, so the pointer must be 16-byte aligned)
;           arg(6) = bps, used as the shift count for the clamp maximum
;   Out:    k0k6/k1k7/k2k5/k3k4 = tap pairs interleaved word by word in the
;                                 low four dwords' worth of each slot
;           krd = 0x40 in every dword (rounding term for the >> 7 that follows)
;           max = (1 << bps) - 1 in every word, min = 0
;   The k*, krd, max and min names must be %define'd by the caller as
;   16-byte-aligned memory operands (they are written with movdqa).
;   Clobbers: rcx, rdx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;rdx -> filter taps
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    ;replicate k0..k3 over the low four words of xmm0..xmm3
    pshuflw     xmm0, xmm7, 00000000b       ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3

    ;move k4..k7 into the low half, then replicate them the same way
    psrldq      xmm7, 8
    pshuflw     xmm4, xmm7, 00000000b       ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    ;interleave each tap pair and park it in its stack slot
    punpcklwd   xmm0, xmm6
    movdqa      k0k6, xmm0                  ;k0,k6,k0,k6,...
    punpcklwd   xmm2, xmm5
    movdqa      k2k5, xmm2                  ;k2,k5,k2,k5,...
    punpcklwd   xmm3, xmm4
    movdqa      k3k4, xmm3                  ;k3,k4,k3,k4,...
    punpcklwd   xmm1, xmm7
    movdqa      k1k7, xmm1                  ;k1,k7,k1,k7,...

    ;rounding term: 0x40 in every dword
    mov         rcx, 0x40
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    ;clamp limits: max = (1 << bps) - 1 per word, min = 0
    mov         rdx, 0x00010001
    movq        xmm0, rdx
    pshufd      xmm0, xmm0, 00000000b       ;every word = 1
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm1, rcx                   ;shift count for psllw
    movdqa      xmm2, xmm0                  ;keep a copy of the ones
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    movdqa      max, xmm0                   ;upper clamp bound
    pxor        xmm1, xmm1
    movdqa      min, xmm1                   ;lower clamp bound (zero)

%endm
60
;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
;   Filters four 16-bit samples and stores four words at [rdi].
;   In:     xmm0..xmm7 = samples for taps 0..7 in their low four words
;           k0k6/k1k7/k2k5/k3k4, krd, max, min as written by
;           HIGH_GET_FILTERS_4
;   %1:     non-zero -> pavgw the result with the four words already at [rdi]
;   Result: clamp((sum of tap products + krd) >> 7, min, max)
;   Clobbers: xmm0-xmm3
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    ;interleave the two sample vectors of each tap pair, then multiply-add
    ;them against the matching interleaved coefficients (dword results)
    punpcklwd   xmm0, xmm6
    pmaddwd     xmm0, k0k6                  ;tap0 + tap6 products
    punpcklwd   xmm1, xmm7
    pmaddwd     xmm1, k1k7                  ;tap1 + tap7 products
    punpcklwd   xmm2, xmm5
    pmaddwd     xmm2, k2k5                  ;tap2 + tap5 products
    punpcklwd   xmm3, xmm4
    pmaddwd     xmm3, k3k4                  ;tap3 + tap4 products

    ;accumulate; the k3k4 term goes in last
    paddd       xmm0, xmm1
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3

    paddd       xmm0, krd                   ;add rounding term
    psrad       xmm0, 7                     ;arithmetic shift right by 7
    packssdw    xmm0, xmm0                  ;dwords -> words (signed saturation)

    ;limit to [min, max]
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]                 ;current destination samples
    pavgw       xmm0, xmm1                  ;rounded average with them
%endif
    movq        [rdi], xmm0                 ;write four words
%endm
90
;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS
;   Prepares pointers and constants for the 8-wide kernels
;   (HIGH_APPLY_FILTER_8).
;   In:     arg(0) = source pointer, arg(2) = destination pointer
;           arg(5) = pointer to the 8 filter taps (16-bit each; loaded with
;                    movdqa, so the pointer must be 16-byte aligned)
;           arg(6) = bps, used as the shift count for the clamp maximum
;   Out:    rsi = arg(0), rdi = arg(2)
;           k0k1/k6k7/k2k5/k3k4 = tap pairs interleaved across all eight words
;           krd = 0x40 in every dword (rounding term for the >> 7 that follows)
;           max = (1 << bps) - 1 in every word, min = 0
;   The k*, krd, max and min names must be %define'd by the caller as
;   16-byte-aligned memory operands (they are written with movdqa).
;   Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;rdx -> filter taps
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    ;k0..k3 replicated in the low four words, k4..k7 in the high four words
    pshuflw     xmm0, xmm7, 00000000b       ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 00000000b       ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7

    ;build each interleaved pair and store it on the stack
    punpcklwd   xmm0, xmm1                  ;low halves: k0,k1,k0,k1,...
    movdqa      k0k1, xmm0
    punpckhwd   xmm6, xmm7                  ;high halves: k6,k7,k6,k7,...
    movdqa      k6k7, xmm6
    punpcklqdq  xmm2, xmm2                  ;copy k2 into the high half too
    punpckhwd   xmm2, xmm5                  ;k2,k5,k2,k5,...
    movdqa      k2k5, xmm2
    punpcklqdq  xmm3, xmm3                  ;copy k3 into the high half too
    punpckhwd   xmm3, xmm4                  ;k3,k4,k3,k4,...
    movdqa      k3k4, xmm3

    ;rounding term: 0x40 in every dword
    mov         rcx, 0x40
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    ;clamp limits: max = (1 << bps) - 1 per word, min = 0
    mov         rdx, 0x00010001
    movq        xmm0, rdx
    pshufd      xmm0, xmm0, 00000000b       ;every word = 1
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm1, rcx                   ;shift count for psllw
    movdqa      xmm2, xmm0                  ;keep a copy of the ones
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    movdqa      max, xmm0                   ;upper clamp bound
    pxor        xmm1, xmm1
    movdqa      min, xmm1                   ;lower clamp bound (zero)
%endm
135
;-----------------------------------------------------------------------------
; LOAD_VERT_8 byte_offset
;   Loads 16 bytes from each of eight consecutive source rows into
;   xmm0..xmm7 (row n -> xmm<n>) and advances rsi by one row.
;   In:     rsi = address of row 0, rax = row stride in bytes,
;           rdx = 3 * rax, %1 = byte offset added to every row address
;   Out:    xmm0..xmm7 = rows 0..7, rsi = rsi + rax
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    ;rows reachable from the incoming rsi (offsets 0, 1 and 6 strides)
    movdqu      xmm0, [rsi + %1]            ;row 0
    movdqu      xmm1, [rsi + rax + %1]      ;row 1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;row 6 (2 * 3 strides)

    lea         rsi,  [rsi + rax]           ;step to the next row

    ;remaining rows, addressed from the advanced rsi
    movdqu      xmm2, [rsi + rax + %1]      ;row 2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ;row 3
    movdqu      xmm4, [rsi + rdx + %1]      ;row 4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ;row 5
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;row 7
%endm
147
;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg, dst_offset
;   Filters eight 16-bit samples and stores eight words at [rdi + %2].
;   In:     xmm0..xmm7 = samples for taps 0..7 (eight words each)
;           k0k1/k6k7/k2k5/k3k4, krd, max, min as written by HIGH_GET_FILTERS
;           temp = 16-byte scratch slot in the caller's frame
;   %1:     non-zero -> pavgw the result with the eight words already at
;           [rdi + %2]
;   %2:     byte offset added to rdi for the destination access
;   Result: clamp((sum of tap products + krd) >> 7, min, max)
;   Clobbers: xmm0-xmm7, temp
;
;   temp is accessed with movdqa: every caller places it in the same
;   16-byte-aligned frame as the k*/krd/max/min slots, which are already used
;   with movdqa stores and as pmaddwd memory operands, so the aligned form is
;   always valid here.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 2
    ;all eight registers are live, so tap 4 is parked in temp while the
    ;other pairs are interleaved
    movdqa      temp, xmm4                  ;spill tap-4 samples
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ;taps 0/1, samples 0-3
    punpckhwd   xmm4, xmm1                  ;taps 0/1, samples 4-7
    movdqa      xmm1, xmm6
    punpcklwd   xmm6, xmm7                  ;taps 6/7, samples 0-3
    punpckhwd   xmm1, xmm7                  ;taps 6/7, samples 4-7
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm5                  ;taps 2/5, samples 0-3
    punpckhwd   xmm7, xmm5                  ;taps 2/5, samples 4-7

    movdqa      xmm5, temp                  ;reload tap-4 samples
    movdqa      temp, xmm4                  ;spill taps 0/1 high half
    movdqa      xmm4, xmm3
    punpcklwd   xmm3, xmm5                  ;taps 3/4, samples 0-3
    punpckhwd   xmm4, xmm5                  ;taps 3/4, samples 4-7
    movdqa      xmm5, temp                  ;xmm5 = taps 0/1, samples 4-7

    ;multiply-add each interleaved pair against its coefficients (dwords)
    pmaddwd     xmm0, k0k1
    pmaddwd     xmm5, k0k1
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm1, k6k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm7, k2k5
    pmaddwd     xmm3, k3k4
    pmaddwd     xmm4, k3k4

    ;accumulate samples 0-3 in xmm0 and samples 4-7 in xmm5; k3k4 goes last
    paddd       xmm0, xmm6
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3
    paddd       xmm5, xmm1
    paddd       xmm5, xmm7
    paddd       xmm5, xmm4

    paddd       xmm0, krd                   ;rounding
    paddd       xmm5, krd
    psrad       xmm0, 7                     ;shift
    psrad       xmm5, 7
    packssdw    xmm0, xmm5                  ;pack back to word (signed saturation)

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]            ;current destination samples
    pavgw       xmm0, xmm1                  ;rounded average with them
%endif
    movdqu      [rdi + %2], xmm0            ;destination may be unaligned
%endm
199
200SECTION .text
201
;void vpx_highbd_filter_block1d4_v8_sse2
;(
;    unsigned short *src_ptr,        arg(0): 16-bit samples, row of tap 0
;    unsigned int    src_pitch,      arg(1): in samples (doubled to bytes below)
;    unsigned short *output_ptr,     arg(2): 16-bit samples
;    unsigned int    out_pitch,      arg(3): in samples (doubled to bytes below)
;    unsigned int    output_height,  arg(4): number of rows to produce
;    short *filter,                  arg(5): 8 taps, 16-byte aligned
;    int    bps                      arg(6): clamp maximum is (1 << bps) - 1
;)
; Vertical 8-tap filter, 4 samples per row. Each output row uses source rows
; 0..7 counted from the current src position; src advances one row per output.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven 16-byte scratch slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;three source rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one source row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
271
;void vpx_highbd_filter_block1d8_v8_sse2
;(
;    unsigned short *src_ptr,        arg(0): 16-bit samples, row of tap 0
;    unsigned int    src_pitch,      arg(1): in samples (doubled to bytes below)
;    unsigned short *output_ptr,     arg(2): 16-bit samples
;    unsigned int    out_pitch,      arg(3): in samples (doubled to bytes below)
;    unsigned int    output_height,  arg(4): number of rows to produce
;    short *filter,                  arg(5): 8 taps, 16-byte aligned
;    int    bps                      arg(6): clamp maximum is (1 << bps) - 1
;)
; Vertical 8-tap filter, 8 samples per row. Each output row uses source rows
; 0..7 counted from the current src position; LOAD_VERT_8 advances src by one
; row per output.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;three source rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    LOAD_VERT_8 0                           ;rows 0..7, rsi += one row
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
330
;void vpx_highbd_filter_block1d16_v8_sse2
;(
;    unsigned short *src_ptr,        arg(0): 16-bit samples, row of tap 0
;    unsigned int    src_pitch,      arg(1): in samples (doubled to bytes below)
;    unsigned short *output_ptr,     arg(2): 16-bit samples
;    unsigned int    out_pitch,      arg(3): in samples (doubled to bytes below)
;    unsigned int    output_height,  arg(4): number of rows to produce
;    short *filter,                  arg(5): 8 taps, 16-byte aligned
;    int    bps                      arg(6): clamp maximum is (1 << bps) - 1
;)
; Vertical 8-tap filter, 16 samples per row, done as two 8-sample halves
; (byte offsets 0 and 16) per output row.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;three source rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    LOAD_VERT_8 0                           ;left half, rsi += one row
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;back to the same row for the right half

    LOAD_VERT_8 16                          ;right half, rsi += one row
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx                    ;next output row

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
393
;void vpx_highbd_filter_block1d4_v8_avg_sse2
;(
;    unsigned short *src_ptr,        arg(0): 16-bit samples, row of tap 0
;    unsigned int    src_pitch,      arg(1): in samples (doubled to bytes below)
;    unsigned short *output_ptr,     arg(2): 16-bit samples, read and written
;    unsigned int    out_pitch,      arg(3): in samples (doubled to bytes below)
;    unsigned int    output_height,  arg(4): number of rows to produce
;    short *filter,                  arg(5): 8 taps, 16-byte aligned
;    int    bps                      arg(6): clamp maximum is (1 << bps) - 1
;)
; Vertical 8-tap filter, 4 samples per row; the filtered value is averaged
; (pavgw) with the samples already in the destination.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven 16-byte scratch slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;three source rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one source row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
454
;void vpx_highbd_filter_block1d8_v8_avg_sse2
;(
;    unsigned short *src_ptr,        arg(0): 16-bit samples, row of tap 0
;    unsigned int    src_pitch,      arg(1): in samples (doubled to bytes below)
;    unsigned short *output_ptr,     arg(2): 16-bit samples, read and written
;    unsigned int    out_pitch,      arg(3): in samples (doubled to bytes below)
;    unsigned int    output_height,  arg(4): number of rows to produce
;    short *filter,                  arg(5): 8 taps, 16-byte aligned
;    int    bps                      arg(6): clamp maximum is (1 << bps) - 1
;)
; Vertical 8-tap filter, 8 samples per row; the filtered value is averaged
; (pavgw) with the samples already in the destination.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;three source rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    LOAD_VERT_8 0                           ;rows 0..7, rsi += one row
    HIGH_APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
503
;void vpx_highbd_filter_block1d16_v8_avg_sse2
;(
;    unsigned short *src_ptr,        arg(0): 16-bit samples, row of tap 0
;    unsigned int    src_pitch,      arg(1): in samples (doubled to bytes below)
;    unsigned short *output_ptr,     arg(2): 16-bit samples, read and written
;    unsigned int    out_pitch,      arg(3): in samples (doubled to bytes below)
;    unsigned int    output_height,  arg(4): number of rows to produce
;    short *filter,                  arg(5): 8 taps, 16-byte aligned
;    int    bps                      arg(6): clamp maximum is (1 << bps) - 1
;)
; Vertical 8-tap filter, 16 samples per row as two 8-sample halves (byte
; offsets 0 and 16); the filtered value is averaged (pavgw) with the samples
; already in the destination.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;three source rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    LOAD_VERT_8 0                           ;left half, rsi += one row
    HIGH_APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;back to the same row for the right half

    LOAD_VERT_8 16                          ;right half, rsi += one row
    HIGH_APPLY_FILTER_8 1, 16
    add         rdi, rbx                    ;next output row

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
556
;void vpx_highbd_filter_block1d4_h8_sse2
;(
;    unsigned short *src_ptr,             arg(0): 16-bit samples
;    unsigned int    src_pixels_per_line, arg(1): in samples (doubled to bytes)
;    unsigned short *output_ptr,          arg(2): 16-bit samples
;    unsigned int    output_pitch,        arg(3): in samples (doubled to bytes)
;    unsigned int    output_height,       arg(4): number of rows to produce
;    short *filter,                       arg(5): 8 taps, 16-byte aligned
;    int    bps                           arg(6): clamp maximum is (1 << bps) - 1
;)
; Horizontal 8-tap filter, 4 samples per row. Each row reads 24 bytes starting
; 6 bytes (3 samples) before src_ptr.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven 16-byte scratch slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    movdqu      xmm0,   [rsi - 6]           ;load src: samples -3..4 (tap 0)
    movdqu      xmm4,   [rsi + 2]           ;samples 1..8 (tap 4)
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;shift each copy by whole samples so xmm<n> starts at tap n's first sample
    psrldq      xmm1, 2                     ;tap 1
    psrldq      xmm6, 4                     ;tap 6
    psrldq      xmm7, 6                     ;tap 7
    psrldq      xmm2, 4                     ;tap 2
    psrldq      xmm3, 6                     ;tap 3
    psrldq      xmm5, 2                     ;tap 5

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
631
;void vpx_highbd_filter_block1d8_h8_sse2
;(
;    unsigned short *src_ptr,             arg(0): 16-bit samples
;    unsigned int    src_pixels_per_line, arg(1): in samples (doubled to bytes)
;    unsigned short *output_ptr,          arg(2): 16-bit samples
;    unsigned int    output_pitch,        arg(3): in samples (doubled to bytes)
;    unsigned int    output_height,       arg(4): number of rows to produce
;    short *filter,                       arg(5): 8 taps, 16-byte aligned
;    int    bps                           arg(6): clamp maximum is (1 << bps) - 1
;)
; Horizontal 8-tap filter, 8 samples per row. Tap n is loaded from
; src_ptr + 2 * (n - 3) bytes, so each row reads from 6 bytes before src_ptr
; through 24 bytes after it.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0
    movdqu      xmm1,   [rsi - 4]           ;tap 1
    movdqu      xmm2,   [rsi - 2]           ;tap 2
    movdqu      xmm3,   [rsi]               ;tap 3
    movdqu      xmm4,   [rsi + 2]           ;tap 4
    movdqu      xmm5,   [rsi + 4]           ;tap 5
    movdqu      xmm6,   [rsi + 6]           ;tap 6
    movdqu      xmm7,   [rsi + 8]           ;tap 7

    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
697
;void vpx_highbd_filter_block1d16_h8_sse2
;(
;    unsigned short *src_ptr,             arg(0): 16-bit samples
;    unsigned int    src_pixels_per_line, arg(1): in samples (doubled to bytes)
;    unsigned short *output_ptr,          arg(2): 16-bit samples
;    unsigned int    output_pitch,        arg(3): in samples (doubled to bytes)
;    unsigned int    output_height,       arg(4): number of rows to produce
;    short *filter,                       arg(5): 8 taps, 16-byte aligned
;    int    bps                           arg(6): clamp maximum is (1 << bps) - 1
;)
; Horizontal 8-tap filter, 16 samples per row, done as two 8-sample halves.
; Each row reads from 6 bytes before src_ptr through 40 bytes after it.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    ;left half: output samples 0..7
    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0
    movdqu      xmm1,   [rsi - 4]           ;tap 1
    movdqu      xmm2,   [rsi - 2]           ;tap 2
    movdqu      xmm3,   [rsi]               ;tap 3
    movdqu      xmm4,   [rsi + 2]           ;tap 4
    movdqu      xmm5,   [rsi + 4]           ;tap 5
    movdqu      xmm6,   [rsi + 6]           ;tap 6
    movdqu      xmm7,   [rsi + 8]           ;tap 7

    HIGH_APPLY_FILTER_8 0, 0

    ;right half: same taps, 16 bytes (8 samples) further along the row
    movdqu      xmm0,   [rsi + 10]          ;load src: tap 0
    movdqu      xmm1,   [rsi + 12]          ;tap 1
    movdqu      xmm2,   [rsi + 14]          ;tap 2
    movdqu      xmm3,   [rsi + 16]          ;tap 3
    movdqu      xmm4,   [rsi + 18]          ;tap 4
    movdqu      xmm5,   [rsi + 20]          ;tap 5
    movdqu      xmm6,   [rsi + 22]          ;tap 6
    movdqu      xmm7,   [rsi + 24]          ;tap 7

    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
774
;void vpx_highbd_filter_block1d4_h8_avg_sse2
;(
;    unsigned short *src_ptr,             arg(0): 16-bit samples
;    unsigned int    src_pixels_per_line, arg(1): in samples (doubled to bytes)
;    unsigned short *output_ptr,          arg(2): 16-bit samples, read and written
;    unsigned int    output_pitch,        arg(3): in samples (doubled to bytes)
;    unsigned int    output_height,       arg(4): number of rows to produce
;    short *filter,                       arg(5): 8 taps, 16-byte aligned
;    int    bps                           arg(6): clamp maximum is (1 << bps) - 1
;)
; Horizontal 8-tap filter, 4 samples per row; the filtered value is averaged
; (pavgw) with the samples already in the destination. Each row reads 24 bytes
; starting 6 bytes (3 samples) before src_ptr.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7                 ;seven 16-byte scratch slots
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    movdqu      xmm0,   [rsi - 6]           ;load src: samples -3..4 (tap 0)
    movdqu      xmm4,   [rsi + 2]           ;samples 1..8 (tap 4)
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;shift each copy by whole samples so xmm<n> starts at tap n's first sample
    psrldq      xmm1, 2                     ;tap 1
    psrldq      xmm6, 4                     ;tap 6
    psrldq      xmm7, 6                     ;tap 7
    psrldq      xmm2, 4                     ;tap 2
    psrldq      xmm3, 6                     ;tap 3
    psrldq      xmm5, 2                     ;tap 5

    HIGH_APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
840
;void vpx_highbd_filter_block1d8_h8_avg_sse2
;(
;    unsigned short *src_ptr,             arg(0): 16-bit samples
;    unsigned int    src_pixels_per_line, arg(1): in samples (doubled to bytes)
;    unsigned short *output_ptr,          arg(2): 16-bit samples, read and written
;    unsigned int    output_pitch,        arg(3): in samples (doubled to bytes)
;    unsigned int    output_height,       arg(4): number of rows to produce
;    short *filter,                       arg(5): 8 taps, 16-byte aligned
;    int    bps                           arg(6): clamp maximum is (1 << bps) - 1
;)
; Horizontal 8-tap filter, 8 samples per row; the filtered value is averaged
; (pavgw) with the samples already in the destination. Tap n is loaded from
; src_ptr + 2 * (n - 3) bytes.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0
    movdqu      xmm1,   [rsi - 4]           ;tap 1
    movdqu      xmm2,   [rsi - 2]           ;tap 2
    movdqu      xmm3,   [rsi]               ;tap 3
    movdqu      xmm4,   [rsi + 2]           ;tap 4
    movdqu      xmm5,   [rsi + 4]           ;tap 5
    movdqu      xmm6,   [rsi + 6]           ;tap 6
    movdqu      xmm7,   [rsi + 8]           ;tap 7

    HIGH_APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
897
;void vpx_highbd_filter_block1d16_h8_avg_sse2
;(
;    unsigned short *src_ptr,             arg(0): 16-bit samples
;    unsigned int    src_pixels_per_line, arg(1): in samples (doubled to bytes)
;    unsigned short *output_ptr,          arg(2): 16-bit samples, read and written
;    unsigned int    output_pitch,        arg(3): in samples (doubled to bytes)
;    unsigned int    output_height,       arg(4): number of rows to produce
;    short *filter,                       arg(5): 8 taps, 16-byte aligned
;    int    bps                           arg(6): clamp maximum is (1 << bps) - 1
;)
; Horizontal 8-tap filter, 16 samples per row as two 8-sample halves; the
; filtered value is averaged (pavgw) with the samples already in the
; destination. Each row reads from 6 bytes before src_ptr through 40 bytes
; after it.
; NOTE(review): pointer types are inferred from the 16-bit loads/stores -
; confirm against the C prototype.
global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi = src, rdi = dst

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx
    jz          .done                       ;no rows: dec/jnz below would wrap

.loop:
    ;left half: output samples 0..7
    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0
    movdqu      xmm1,   [rsi - 4]           ;tap 1
    movdqu      xmm2,   [rsi - 2]           ;tap 2
    movdqu      xmm3,   [rsi]               ;tap 3
    movdqu      xmm4,   [rsi + 2]           ;tap 4
    movdqu      xmm5,   [rsi + 4]           ;tap 5
    movdqu      xmm6,   [rsi + 6]           ;tap 6
    movdqu      xmm7,   [rsi + 8]           ;tap 7

    HIGH_APPLY_FILTER_8 1, 0

    ;right half: same taps, 16 bytes (8 samples) further along the row
    movdqu      xmm0,   [rsi + 10]          ;load src: tap 0
    movdqu      xmm1,   [rsi + 12]          ;tap 1
    movdqu      xmm2,   [rsi + 14]          ;tap 2
    movdqu      xmm3,   [rsi + 16]          ;tap 3
    movdqu      xmm4,   [rsi + 18]          ;tap 4
    movdqu      xmm5,   [rsi + 20]          ;tap 5
    movdqu      xmm6,   [rsi + 22]          ;tap 6
    movdqu      xmm7,   [rsi + 24]          ;tap 7

    HIGH_APPLY_FILTER_8 1, 16

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
965