;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times  8 dw  8
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 15
                     times  8 dw  1
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 13
                     times  8 dw  3
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 11
                     times  8 dw  5
                     times  8 dw 10
                     times  8 dw  6
                     times  8 dw  9
                     times  8 dw  7
                     times 16 dw  8
                     times  8 dw  7
                     times  8 dw  9
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  5
                     times  8 dw 11
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  3
                     times  8 dw 13
                     times  8 dw  2
                     times  8 dw 14
                     times  8 dw  1
                     times  8 dw 15
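
; Layout of bilin_filter_m_sse2: one 32-byte entry per subpel phase, 8 words
; of (16 - frac) followed by 8 words of frac; the half-pel entry (frac == 8)
; collapses into a single run of 16 words of 8. filter_idx_shift (5, defined
; below) turns a phase into a byte offset into this table, and the psrlw by 4
; after filtering undoes the sum-to-16 coefficient scaling.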

SECTION .text

; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src,
;                                      ptrdiff_t src_stride,
;                                      int x_offset, int y_offset,
;                                      const uint16_t *dst,
;                                      ptrdiff_t dst_stride,
;                                      int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) as its return value and stores
; the sum of squared errors (SSE) through the given pointer.
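;
; Callers combine the two outputs into a variance. A minimal C sketch of that
; relationship (the wrapper name and the W/H of 16 are illustrative only, not
; part of this file; the real entry points are declared by the C wrappers):
;
;   unsigned int sse;
;   int se = vpx_highbd_sub_pixel_variance16xh(src, src_stride,
;                                              x_offset, y_offset,
;                                              dst, dst_stride, 16, &sse);
;   uint32_t var = sse - (uint32_t)(((int64_t)se * se) / (16 * 16));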

%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  mova                 %4, %3       ; make copies to manipulate to calc sum
  mova                 %2, %1       ; use originals for calc sse
  pmaddwd              %3, %3
  paddw                %4, %2
  pmaddwd              %1, %1
  movhlps              %2, %4
  paddd                %6, %3
  paddw                %4, %2
  pxor                 %2, %2
  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
  punpcklwd            %4, %2       ; sign-extend word to dword
  paddd                %6, %1
  paddd                %5, %4
%endmacro
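
; Each SUM_SSE invocation folds 16 word differences into the running
; accumulators: sse as four dword partial sums in %6, sum as four
; sign-extended dword partial sums in %5. A scalar sketch of the same
; arithmetic (illustrative only; the macro interleaves the two 8-word halves):
;
;   for (i = 0; i < 8; i++) {
;     int d0 = src1[i] - dst1[i], d1 = src2[i] - dst2[i];
;     sum += d0 + d1;
;     sse += d0 * d0 + d1 * d1;
;   }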

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  movhlps              m3, m7
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  pshufd               m4, m6, 0x1
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; store sse
  movd                rax, m6           ; store sum as return value
%endif
  RET
%endmacro
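
; The two paddd/shuffle steps above reduce the four dword lanes of m7 (sse)
; and m6 (sum) to a single total in lane 0; with lanes written [3 2 1 0]:
;
;   [d c b a] + movhlps copy [. . d c]  ->  [. . b+d a+c]
;   [. . b+d a+c] + pshufd 0x1 copy     ->  [. . . a+b+c+d]
;
; movd then stores lane 0: m7's total through the sse pointer, m6's total
; into rax as the return value.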

%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
  add                srcq, src_stridemp
%else
  lea                srcq, [srcq + src_strideq*2]
%endif
%endmacro
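
; On x86-32 PIC builds the stride is only available in a stack slot
; (src_stridemp), so a scaled lea cannot be used; adding the memory operand
; twice advances srcq by one row of 16-bit samples (stride * 2 bytes).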

%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5

%ifdef PIC    ; 64bit PIC
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      dst, dst_stride, \
                                      sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                  y_offset, dst, dst_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if ARCH_X86=1 && CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                  x_offset, y_offset, \
                                  dst, dst_stride, \
                                  sec, sec_stride, \
                                  height, sse, g_bilin_filter, g_pw_8
      %define block_height dword heightm
      %define sec_str sec_stridemp

      ; Store bilin_filter and pw_8 location in stack
      %if GET_GOT_DEFINED == 1
        GET_GOT eax
        add esp, 4                ; restore esp
      %endif

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                x_offset, y_offset, dst, dst_stride, height, \
                                sse, g_bilin_filter, g_pw_8
      %define block_height heightd

      ; Store bilin_filter and pw_8 location in stack
      %if GET_GOT_DEFINED == 1
        GET_GOT eax
        add esp, 4                ; restore esp
      %endif

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %endif
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             dst, dst_stride, \
                                             sec, sec_stride, \
                                             height, sse
      %if ARCH_X86_64
      %define block_height heightd
      %define sec_str sec_strideq
      %else
      %define block_height dword heightm
      %define sec_str sec_stridemp
      %endif
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                              x_offset, y_offset, dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse

%if %1 < 16
  sar                   block_height, 1
%endif
%if %2 == 1 ; avg
  shl             sec_str, 1
%endif
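
; For W < 16 (i.e. the 8-wide kernels) each loop iteration below handles two
; rows, one 8-word row per xmm register; this is why block_height is halved
; above and why those loops step by src_strideq*4. sec_str is doubled for the
; same reason strides are scaled by 2 elsewhere: samples are 2 bytes wide.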

  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + 16]
  mova                 m1, [dstq]
  mova                 m3, [dstq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m2, [secq+16]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + src_strideq*2]
  mova                 m1, [dstq]
  mova                 m3, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [dstq]
  mova                 m3, [dstq+16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m2, [dstq]
  mova                 m3, [dstq+dst_strideq*2]
  pavgw                m0, m1
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [dstq]
  mova                 m3, [dstq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
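  ;
  ; The two forms agree by integer algebra: with num=16 and rnd=8,
  ;   ((16-x)*in1 + x*in2 + 8) >> 4
  ;     == (16*in1 + x*(in2-in1) + 8) >> 4
  ;     == in1 + ((x*(in2-in1) + 8) >> 4)
  ; because the 16*in1 term has four zero low bits and shifts out exactly,
  ; leaving the single multiply x*(in2-in1) per output row.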
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m4, m1
  mova                 m2, [dstq]
  mova                 m3, [dstq+dst_strideq*2]
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp           x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  mova                 m2, [dstq]
  mova                 m3, [dstq + 16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  mova                 m2, [dstq]
  mova                 m3, [dstq + dst_strideq*2]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m1, m3
  mova                 m4, [dstq]
  mova                 m5, [dstq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m2, m3
  mova                 m4, [dstq]
  mova                 m5, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else  ; x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m1, filter_rnd
  paddw                m1, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m1, 4
  paddw                m0, m2
  mova                 m2, [dstq]
  psrlw                m0, 4
  mova                 m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m4, filter_rnd
  paddw                m4, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m4, 4
  paddw                m0, m2
  mova                 m2, [dstq]
  psrlw                m0, 4
  mova                 m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m4, [secq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  mova                 m4, [dstq]
  mova                 m5, [dstq+16]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+src_strideq*2+2]
  mova                 m4, [dstq]
  mova                 m5, [dstq+dst_strideq*2]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*4]
  lea                dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [dstq]
  mova                 m5, [dstq+16]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m1, m3
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [dstq]
  mova                 m5, [dstq+dst_strideq*2]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m2, m3
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq+src_strideq*4]
  lea                dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; Load the filters; this is the same as in the 8-bit-depth version.
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [bilin_filter+y_offsetq]
  mova                m11, [bilin_filter+y_offsetq+16]
  mova                m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case there is no unused register, so the src_stride register is
; repurposed; src_stride is reloaded from the stack later, when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
; end of load filter
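;
; On x86-64 all five filter operands stay resident in m8-m12 across the whole
; loop; x86-32 only has m0-m7, so the filter_x_*/filter_y_*/filter_rnd names
; defined above resolve to memory operands there and are re-read inside the
; loop on every use.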

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  movu                 m1, [srcq+16]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  movu                 m3, [srcq+16]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m1, filter_rnd
  mova                 m2, [dstq]
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  mova                 m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq + dst_strideq * 2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu                 m3, [srcq]
  movu                 m5, [srcq+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m4, filter_rnd
  mova                 m2, [dstq]
  paddw                m4, m3
  psrlw                m0, 4
  psrlw                m4, 4
  mova                 m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m4, [secq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq + dst_strideq * 4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

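; The instantiations above emit the 8-wide and 16-wide kernels, each in a
; plain and a second-prediction averaging (avg) variant; C-side wrappers are
; expected to tile these Nxh kernels to cover the larger block sizes.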