1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; HIGH_GET_PARAM_4
;   Loads the arguments and constants used by the 4-sample-wide kernels.
;   Register state on exit:
;     rsi  = arg(0) src_ptr            rdi  = arg(2) output_ptr
;     rax  = arg(1) pixels_per_line    rdx  = arg(3) out_pitch
;     rcx  = arg(4) output_height
;     xmm4 = filter words 3 and 4 interleaved (k3,k4 pairs for pmaddwd)
;     xmm3 = 0x40 in every dword (rounding term for the later >> 7)
;     xmm5 = (1 << bps) - 1 in every word, bps = arg(6) (upper clamp)
;     xmm2 = 0 (lower clamp)
;   xmm1 is used as scratch.  The filter table at arg(5) is read with
;   movdqa, so it has to be 16-byte aligned.
%macro HIGH_GET_PARAM_4 0
    ; --- pointers ---
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    mov         rdx, arg(5)                 ; filter ptr

    ; --- filter taps ---
    movdqa      xmm3, [rdx]                 ; all eight filter words
    pshuflw     xmm4, xmm3, 0xff            ; low 4 words = k3
    psrldq      xmm3, 8                     ; bring word 4 down to word 0
    pshuflw     xmm3, xmm3, 0x00            ; low 4 words = k4
    punpcklwd   xmm4, xmm3                  ; k3 k4 k3 k4 k3 k4 k3 k4

    ; --- rounding term ---
    mov         rcx, 0x40                   ; half of 1 << 7
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0x00            ; replicate to all four dwords

    ; --- clamp bounds ---
    mov         rdx, 0x00010001             ; the value 1 in two adjacent words
    movq        xmm5, rdx
    pshufd      xmm5, xmm5, 0x00            ; 1 in every word
    movdqa      xmm1, xmm5                  ; keep a copy of the ones
    movsxd      rcx, DWORD PTR arg(6)       ; bps
    movq        xmm2, rcx                   ; shift count
    psllw       xmm5, xmm2                  ; 1 << bps
    psubw       xmm5, xmm1                  ; (1 << bps) - 1: max sample value
    pxor        xmm2, xmm2                  ; 0: min sample value

    ; --- strides and row count ---
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endm
42
; HIGH_APPLY_FILTER_4 <avg>
;   Filters one row of four 16-bit samples and steps to the next row.
;   In:  low 4 words of xmm0 = first-tap samples, low 4 words of xmm1 =
;        second-tap samples; xmm2-xmm5, rax, rcx, rdx, rsi, rdi as set up
;        by HIGH_GET_PARAM_4.
;   %1:  non-zero -> the result is averaged (pavgw) with the 8 bytes that
;        are already stored at [rdi].
;   Out: 8 bytes written to [rdi]; rsi and rdi advanced by 2*rax and 2*rdx
;        bytes; rcx decremented last, so ZF reflects the remaining row
;        count when the macro ends.
%macro HIGH_APPLY_FILTER_4 1
    punpcklwd   xmm0, xmm1                  ; a0 b0 a1 b1 a2 b2 a3 b3
%if %1
    movq        xmm1, [rdi]                 ; xmm1 is free now: fetch dst row
%endif
    pmaddwd     xmm0, xmm4                  ; a*k3 + b*k4 in each dword

    paddd       xmm0, xmm3                  ; + 0x40
    psrad       xmm0, 7                     ; >> 7
    packssdw    xmm0, xmm0                  ; dwords -> words

    ; clamp to [0, (1 << bps) - 1]
    pminsw      xmm0, xmm5
    pmaxsw      xmm0, xmm2

%if %1
    pavgw       xmm0, xmm1                  ; average with existing output
%endif
    movq        [rdi], xmm0                 ; store four samples

    lea         rsi, [rsi + 2*rax]          ; next source row
    lea         rdi, [rdi + 2*rdx]          ; next output row
    dec         rcx                         ; rows left; caller tests ZF
%endm
66
67%if ARCH_X86_64
; HIGH_GET_PARAM
;   Loads the arguments and constants used by the 8- and 16-sample-wide
;   kernels (x86-64 only, since xmm8 is needed).
;   Register state on exit:
;     rsi  = arg(0) src_ptr            rdi  = arg(2) output_ptr
;     rax  = arg(1) pixels_per_line    rdx  = arg(3) out_pitch
;     rcx  = arg(4) output_height
;     xmm7 = filter words 3 and 4 interleaved (k3,k4 pairs for pmaddwd)
;     xmm4 = 0x40 in every dword (rounding term for the later >> 7)
;     xmm8 = (1 << bps) - 1 in every word, bps = arg(6) (upper clamp)
;     xmm5 = 0 (lower clamp)
;   xmm1 and xmm6 are used as scratch.  The filter table at arg(5) is read
;   with movdqa, so it has to be 16-byte aligned.
%macro HIGH_GET_PARAM 0
    ; --- pointers ---
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    mov         rdx, arg(5)                 ; filter ptr

    ; --- filter taps ---
    movdqa      xmm6, [rdx]                 ; all eight filter words
    pshuflw     xmm7, xmm6, 0xff            ; low 4 words = k3
    psrldq      xmm6, 8                     ; bring word 4 down to word 0
    pshuflw     xmm6, xmm6, 0x00            ; low 4 words = k4, high half 0
    punpcklwd   xmm7, xmm6                  ; k3 k4 k3 k4 k3 k4 k3 k4

    ; --- rounding term ---
    mov         rcx, 0x40                   ; half of 1 << 7
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0x00            ; replicate to all four dwords

    ; --- clamp bounds ---
    mov         rdx, 0x00010001             ; the value 1 in two adjacent words
    movq        xmm8, rdx
    pshufd      xmm8, xmm8, 0x00            ; 1 in every word
    movdqa      xmm1, xmm8                  ; keep a copy of the ones
    movsxd      rcx, DWORD PTR arg(6)       ; bps
    movq        xmm5, rcx                   ; shift count
    psllw       xmm8, xmm5                  ; 1 << bps
    psubw       xmm8, xmm1                  ; (1 << bps) - 1: max sample value
    pxor        xmm5, xmm5                  ; 0: min sample value

    ; --- strides and row count ---
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endm
98
; HIGH_APPLY_FILTER_8 <avg>
;   Filters one row of eight 16-bit samples and steps to the next row.
;   In:  xmm0 = first-tap samples, xmm1 = second-tap samples; xmm4, xmm5,
;        xmm7, xmm8, rax, rcx, rdx, rsi, rdi as set up by HIGH_GET_PARAM.
;   %1:  non-zero -> the result is averaged (pavgw) with the 16 bytes that
;        are already stored at [rdi].
;   Out: 16 bytes written to [rdi]; rsi and rdi advanced by 2*rax and 2*rdx
;        bytes; rcx decremented last, so ZF reflects the remaining row
;        count when the macro ends.  xmm6 is clobbered.
%macro HIGH_APPLY_FILTER_8 1
    movdqa      xmm6, xmm0                  ; copy before xmm0 is overwritten
    punpcklwd   xmm0, xmm1                  ; samples 0-3: a b pairs
    punpckhwd   xmm6, xmm1                  ; samples 4-7: a b pairs
%if %1
    movdqu      xmm1, [rdi]                 ; xmm1 is free now: fetch dst row
%endif

    pmaddwd     xmm0, xmm7                  ; a*k3 + b*k4 in each dword
    pmaddwd     xmm6, xmm7

    paddd       xmm0, xmm4                  ; + 0x40
    paddd       xmm6, xmm4

    psrad       xmm0, 7                     ; >> 7
    psrad       xmm6, 7

    packssdw    xmm0, xmm6                  ; eight dwords -> eight words

    ; clamp to [0, (1 << bps) - 1]
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5

%if %1
    pavgw       xmm0, xmm1                  ; average with existing output
%endif
    movdqu      [rdi], xmm0                 ; store eight samples

    lea         rsi, [rsi + 2*rax]          ; next source row
    lea         rdi, [rdi + 2*rdx]          ; next output row
    dec         rcx                         ; rows left; caller tests ZF
%endm
126
; HIGH_APPLY_FILTER_16 <avg>
;   Filters one row of sixteen 16-bit samples and steps to the next row.
;   In:  xmm0/xmm1 = first-/second-tap samples 0-7,
;        xmm2/xmm3 = first-/second-tap samples 8-15;
;        xmm4, xmm5, xmm7, xmm8, rax, rcx, rdx, rsi, rdi as set up by
;        HIGH_GET_PARAM.
;   %1:  non-zero -> the result is averaged (pavgw) with the 32 bytes that
;        are already stored at [rdi].
;   Out: 32 bytes written to [rdi]; rsi and rdi advanced by 2*rax and 2*rdx
;        bytes; rcx decremented last, so ZF reflects the remaining row
;        count when the macro ends.  xmm6 and xmm9 are clobbered.
%macro HIGH_APPLY_FILTER_16 1
    ; samples 0-7: split into low/high a-b pairs
    movdqa      xmm9, xmm0
    punpcklwd   xmm0, xmm1
    punpckhwd   xmm9, xmm1

    ; samples 8-15: same split
    movdqa      xmm6, xmm2
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm6, xmm3

%if %1
    movdqu      xmm1, [rdi]                 ; xmm1/xmm3 are free now:
    movdqu      xmm3, [rdi + 16]            ; fetch the existing dst row
%endif

    pmaddwd     xmm0, xmm7                  ; a*k3 + b*k4 in each dword
    pmaddwd     xmm9, xmm7
    pmaddwd     xmm2, xmm7
    pmaddwd     xmm6, xmm7

    paddd       xmm0, xmm4                  ; + 0x40
    paddd       xmm9, xmm4
    paddd       xmm2, xmm4
    paddd       xmm6, xmm4

    psrad       xmm0, 7                     ; >> 7
    psrad       xmm9, 7
    psrad       xmm2, 7
    psrad       xmm6, 7

    packssdw    xmm0, xmm9                  ; samples 0-7 back to words
    packssdw    xmm2, xmm6                  ; samples 8-15 back to words

    ; clamp to [0, (1 << bps) - 1]
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5
    pminsw      xmm2, xmm8
    pmaxsw      xmm2, xmm5

%if %1
    pavgw       xmm0, xmm1                  ; average with existing output
    pavgw       xmm2, xmm3
%endif
    movdqu      [rdi], xmm0                 ; store samples 0-7
    movdqu      [rdi + 16], xmm2            ; store samples 8-15

    lea         rsi, [rsi + 2*rax]          ; next source row
    lea         rdi, [rdi + 2*rdx]          ; next output row
    dec         rcx                         ; rows left; caller tests ZF
%endm
172%endif
173
; vp9_high_filter_block1d4_v2_sse2
;   Two-tap filter across rows, 4 samples wide: each output row combines
;   the source row at rsi with the row 2*rax bytes after it.
;   Arguments as read by HIGH_GET_PARAM_4:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d4_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movq        xmm1, [rsi + 2*rax]         ; following row (second tap)
    movq        xmm0, [rsi]                 ; current row (first tap)

    HIGH_APPLY_FILTER_4 0
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
197
198%if ARCH_X86_64
; vp9_high_filter_block1d8_v2_sse2
;   Two-tap filter across rows, 8 samples wide: each output row combines
;   the source row at rsi with the row 2*rax bytes after it.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d8_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ; following row (second tap)
    movdqu      xmm0, [rsi]                 ; current row (first tap)

    HIGH_APPLY_FILTER_8 0
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
224
; vp9_high_filter_block1d16_v2_sse2
;   Two-tap filter across rows, 16 samples wide: each output row combines
;   the source row at rsi with the row 2*rax bytes after it.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d16_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; current row (first tap)
    movdqu      xmm0, [rsi]                 ; samples 0-7
    movdqu      xmm2, [rsi + 16]            ; samples 8-15
    ; following row (second tap)
    movdqu      xmm1, [rsi + 2*rax]         ; samples 0-7
    movdqu      xmm3, [rsi + 2*rax + 16]    ; samples 8-15

    HIGH_APPLY_FILTER_16 0
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
252%endif
253
; vp9_high_filter_block1d4_v2_avg_sse2
;   Two-tap filter across rows, 4 samples wide, with the filtered row
;   averaged into the data already present in the output buffer.
;   Arguments as read by HIGH_GET_PARAM_4:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movq        xmm1, [rsi + 2*rax]         ; following row (second tap)
    movq        xmm0, [rsi]                 ; current row (first tap)

    HIGH_APPLY_FILTER_4 1
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
277
278%if ARCH_X86_64
; vp9_high_filter_block1d8_v2_avg_sse2
;   Two-tap filter across rows, 8 samples wide, with the filtered row
;   averaged into the data already present in the output buffer.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ; following row (second tap)
    movdqu      xmm0, [rsi]                 ; current row (first tap)

    HIGH_APPLY_FILTER_8 1
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
304
; vp9_high_filter_block1d16_v2_avg_sse2
;   Two-tap filter across rows, 16 samples wide, with the filtered row
;   averaged into the data already present in the output buffer.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; current row (first tap)
    movdqu      xmm0, [rsi]                 ; samples 0-7
    movdqu      xmm2, [rsi + 16]            ; samples 8-15
    ; following row (second tap)
    movdqu      xmm1, [rsi + 2*rax]         ; samples 0-7
    movdqu      xmm3, [rsi + 2*rax + 16]    ; samples 8-15

    HIGH_APPLY_FILTER_16 1
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
332%endif
333
; vp9_high_filter_block1d4_h2_sse2
;   Two-tap filter along a row, 4 samples wide: each output sample
;   combines source sample x with sample x+1 of the same row.
;   Arguments as read by HIGH_GET_PARAM_4:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
;
;   HIGH_APPLY_FILTER_4 only consumes the low four words of xmm0 and xmm1,
;   i.e. source samples 0-3 and 1-4.  They are fetched with two 8-byte
;   loads so that no more than 10 bytes are read from each source row; a
;   single 16-byte load would touch 6 bytes that are never used and could
;   lie past the end of the buffer.
global sym(vp9_high_filter_block1d4_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    movq        xmm0, [rsi]                 ; samples 0-3 (first tap)
    movq        xmm1, [rsi + 2]             ; samples 1-4 (second tap)

    HIGH_APPLY_FILTER_4 0
    jnz         .loop                       ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
358
359%if ARCH_X86_64
; vp9_high_filter_block1d8_h2_sse2
;   Two-tap filter along a row, 8 samples wide: each output sample
;   combines source sample x with sample x+1 of the same row.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d8_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2]             ; samples 1-8 (second tap)
    movdqu      xmm0, [rsi]                 ; samples 0-7 (first tap)

    HIGH_APPLY_FILTER_8 0
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
385
; vp9_high_filter_block1d16_h2_sse2
;   Two-tap filter along a row, 16 samples wide: each output sample
;   combines source sample x with sample x+1 of the same row.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d16_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; first-tap samples
    movdqu      xmm0, [rsi]                 ; samples 0-7
    movdqu      xmm2, [rsi + 16]            ; samples 8-15
    ; second-tap samples: same data shifted by one sample (2 bytes)
    movdqu      xmm1, [rsi + 2]             ; samples 1-8
    movdqu      xmm3, [rsi + 18]            ; samples 9-16

    HIGH_APPLY_FILTER_16 0
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
413%endif
414
; vp9_high_filter_block1d4_h2_avg_sse2
;   Two-tap filter along a row, 4 samples wide, with the filtered row
;   averaged into the data already present in the output buffer.
;   Arguments as read by HIGH_GET_PARAM_4:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
;
;   HIGH_APPLY_FILTER_4 only consumes the low four words of xmm0 and xmm1,
;   i.e. source samples 0-3 and 1-4.  They are fetched with two 8-byte
;   loads so that no more than 10 bytes are read from each source row; a
;   single 16-byte load would touch 6 bytes that are never used and could
;   lie past the end of the buffer.
global sym(vp9_high_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    movq        xmm0, [rsi]                 ; samples 0-3 (first tap)
    movq        xmm1, [rsi + 2]             ; samples 1-4 (second tap)

    HIGH_APPLY_FILTER_4 1
    jnz         .loop                       ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
439
440%if ARCH_X86_64
; vp9_high_filter_block1d8_h2_avg_sse2
;   Two-tap filter along a row, 8 samples wide, with the filtered row
;   averaged into the data already present in the output buffer.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2]             ; samples 1-8 (second tap)
    movdqu      xmm0, [rsi]                 ; samples 0-7 (first tap)

    HIGH_APPLY_FILTER_8 1
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
466
; vp9_high_filter_block1d16_h2_avg_sse2
;   Two-tap filter along a row, 16 samples wide, with the filtered row
;   averaged into the data already present in the output buffer.
;   Arguments as read by HIGH_GET_PARAM:
;     0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;     4 output_height, 5 filter ptr, 6 bps
global sym(vp9_high_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; first-tap samples
    movdqu      xmm0, [rsi]                 ; samples 0-7
    movdqu      xmm2, [rsi + 16]            ; samples 8-15
    ; second-tap samples: same data shifted by one sample (2 bytes)
    movdqu      xmm1, [rsi + 2]             ; samples 1-8
    movdqu      xmm3, [rsi + 18]            ; samples 9-16

    HIGH_APPLY_FILTER_16 1
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
494%endif
495