;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11%include "third_party/x86inc/x86inc.asm"
12
SECTION_RODATA
; Rounding constant for the bilinear filters: 8 = 1 << (4 - 1), matching
; the psraw-by-4 shift applied after each filter pass.
pw_8: times  8 dw  8
; Bilinear filter coefficients, one entry per subpel offset 0..7; entry i
; holds the coefficient pair (16 - 2*i, 2*i).  The sse2 table keeps the two
; coefficients in separate 8-word rows (each entry is 32 bytes, hence the
; filter_idx_shift of 5 used below); the half-pel entry, where both rows
; are all-8, is emitted as a single "times 16" run.
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

; Same coefficients interleaved as (a, b) byte pairs for pmaddubsw; one
; 16-byte entry per offset (filter_idx_shift of 4 used below).
bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 14,  2
                      times  8 db 12,  4
                      times  8 db 10,  6
                      times 16 db  8
                      times  8 db  6, 10
                      times  8 db  4, 12
                      times  8 db  2, 14
39
40SECTION .text
41
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE (sum of differences) as its return value and
; stores the SSE (sum of squared differences) through the given pointer.
; Accumulate the sum and sum-of-squares of (src - dst) for two rows of
; word-sized pixel differences.
;   %1/%3 = src rows (words), %2/%4 = dst rows (words).
;   %5    = running signed word sum of differences (updated).
;   %6    = running dword sum of squared differences (updated).
; %1 and %3 are clobbered (they end up holding the squared terms); the two
; difference chains are interleaved to hide pmaddwd latency.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4           ; d2 = src2 - dst2
  psubw                %1, %2           ; d1 = src1 - dst1
  paddw                %5, %3           ; sum += d2
  pmaddwd              %3, %3           ; d2*d2, pairwise added into dwords
  paddw                %5, %1           ; sum += d1
  pmaddwd              %1, %1           ; d1*d1, pairwise added into dwords
  paddd                %6, %3           ; sse += d2^2
  paddd                %6, %1           ; sse += d1^2
%endmacro
59
; Horizontally reduce the accumulators (m6 = word sum, m7 = dword sse),
; store the total SSE through the "sse" argument, and return the signed
; sum in rax.  %1 = block width: for W > 4 all 8 word lanes of m6 are
; live and must be sign-extended before the horizontal add; for 4xh only
; the low 4 word lanes are live.  Assumes m5 == 0 on entry (the dedicated
; zero register), which pcmpgtw turns into a sign mask for m6.
%macro STORE_AND_RET 1
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x (m5 was zero)
  movhlps              m3, m7           ; fold high sse half onto low
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3           ; m7 lane 0 = total sse
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4           ; m6 lane 0 = total sum
  movd               raxd, m6           ; store sum as return value
%else ; 4xh
  ; only the low 4 word lanes of m6/m7 are populated for 4-wide blocks
  pshuflw              m4, m6, 0xe      ; bring sum words 2,3 down
  pshuflw              m3, m7, 0xe      ; bring sse dword 1 down
  paddw                m6, m4
  paddd                m7, m3           ; m7 lane 0 = total sse
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshuflw              m4, m6, 0xe
  paddd                m6, m4           ; m6 lane 0 = total sum
  movd               raxd, m6           ; store sum as return value
%endif
  RET
%endmacro
96
; Advance srcq by one source row.  On x86-32 PIC builds the stride is read
; from its stack slot (src_stridemp) rather than a register — presumably
; because register pressure from the GOT setup keeps src_stride spilled
; there; NOTE(review): confirm against the x86inc argument mapping.
%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro
104
105%macro SUBPEL_VARIANCE 1-2 0 ; W
106%if cpuflag(ssse3)
107%define bilin_filter_m bilin_filter_m_ssse3
108%define filter_idx_shift 4
109%else
110%define bilin_filter_m bilin_filter_m_sse2
111%define filter_idx_shift 5
112%endif
113; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
114; 11, not 13, if the registers are ordered correctly. May make a minor speed
115; difference on Win64
116
117%ifdef PIC    ; 64bit PIC
118  %if %2 == 1 ; avg
119    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
120                                      x_offset, y_offset, \
121                                      dst, dst_stride, \
122                                      sec, sec_stride, height, sse
123    %define sec_str sec_strideq
124  %else
125    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
126                                  y_offset, dst, dst_stride, height, sse
127  %endif
128  %define block_height heightd
129  %define bilin_filter sseq
130%else
131  %if ARCH_X86=1 && CONFIG_PIC=1
132    %if %2 == 1 ; avg
133      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
134                                  x_offset, y_offset, \
135                                  dst, dst_stride, \
136                                  sec, sec_stride, \
137                                  height, sse, g_bilin_filter, g_pw_8
138      %define block_height dword heightm
139      %define sec_str sec_stridemp
140
141      ;Store bilin_filter and pw_8 location in stack
142      %if GET_GOT_DEFINED == 1
143        GET_GOT eax
144        add esp, 4                ; restore esp
145      %endif
146
147      lea ecx, [GLOBAL(bilin_filter_m)]
148      mov g_bilin_filterm, ecx
149
150      lea ecx, [GLOBAL(pw_8)]
151      mov g_pw_8m, ecx
152
153      LOAD_IF_USED 0, 1         ; load eax, ecx back
154    %else
155      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
156                                y_offset, dst, dst_stride, height, sse, \
157                                g_bilin_filter, g_pw_8
158      %define block_height heightd
159
160      ;Store bilin_filter and pw_8 location in stack
161      %if GET_GOT_DEFINED == 1
162        GET_GOT eax
163        add esp, 4                ; restore esp
164      %endif
165
166      lea ecx, [GLOBAL(bilin_filter_m)]
167      mov g_bilin_filterm, ecx
168
169      lea ecx, [GLOBAL(pw_8)]
170      mov g_pw_8m, ecx
171
172      LOAD_IF_USED 0, 1         ; load eax, ecx back
173    %endif
174  %else
175    %if %2 == 1 ; avg
176      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
177                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
178                                             x_offset, y_offset, \
179                                             dst, dst_stride, \
180                                             sec, sec_stride, \
181                                             height, sse
182      %if ARCH_X86_64
183      %define block_height heightd
184      %define sec_str sec_strideq
185      %else
186      %define block_height dword heightm
187      %define sec_str sec_stridemp
188      %endif
189    %else
190      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
191                              y_offset, dst, dst_stride, height, sse
192      %define block_height heightd
193    %endif
194
195    %define bilin_filter bilin_filter_m
196  %endif
197%endif
198
199%if %1 == 4
200  %define movx movd
201%else
202  %define movx movh
203%endif
204
205  ASSERT               %1 <= 16         ; m6 overflows if w > 16
206  pxor                 m6, m6           ; sum
207  pxor                 m7, m7           ; sse
208  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
209  ; could perhaps use it for something more productive then
210  pxor                 m5, m5           ; dedicated zero register
211%if %1 < 16
212  sar                   block_height, 1
213%if %2 == 1 ; avg
214  shl             sec_str, 1
215%endif
216%endif
217
218  ; FIXME(rbultje) replace by jumptable?
219  test          x_offsetd, x_offsetd
220  jnz .x_nonzero
221  ; x_offset == 0
222  test          y_offsetd, y_offsetd
223  jnz .x_zero_y_nonzero
224
225  ; x_offset == 0 && y_offset == 0
226.x_zero_y_zero_loop:
227%if %1 == 16
228  movu                 m0, [srcq]
229  mova                 m1, [dstq]
230%if %2 == 1 ; avg
231  pavgb                m0, [secq]
232  punpckhbw            m3, m1, m5
233  punpcklbw            m1, m5
234%endif
235  punpckhbw            m2, m0, m5
236  punpcklbw            m0, m5
237
238%if %2 == 0 ; !avg
239  punpckhbw            m3, m1, m5
240  punpcklbw            m1, m5
241%endif
242  SUM_SSE              m0, m1, m2, m3, m6, m7
243
244  add                srcq, src_strideq
245  add                dstq, dst_strideq
246%else ; %1 < 16
247  movx                 m0, [srcq]
248%if %2 == 1 ; avg
249%if %1 > 4
250  movhps               m0, [srcq+src_strideq]
251%else ; 4xh
252  movx                 m1, [srcq+src_strideq]
253  punpckldq            m0, m1
254%endif
255%else ; !avg
256  movx                 m2, [srcq+src_strideq]
257%endif
258
259  movx                 m1, [dstq]
260  movx                 m3, [dstq+dst_strideq]
261
262%if %2 == 1 ; avg
263%if %1 > 4
264  pavgb                m0, [secq]
265%else
266  movh                 m2, [secq]
267  pavgb                m0, m2
268%endif
269  punpcklbw            m3, m5
270  punpcklbw            m1, m5
271%if %1 > 4
272  punpckhbw            m2, m0, m5
273  punpcklbw            m0, m5
274%else ; 4xh
275  punpcklbw            m0, m5
276  movhlps              m2, m0
277%endif
278%else ; !avg
279  punpcklbw            m0, m5
280  punpcklbw            m2, m5
281  punpcklbw            m3, m5
282  punpcklbw            m1, m5
283%endif
284  SUM_SSE              m0, m1, m2, m3, m6, m7
285
286  lea                srcq, [srcq+src_strideq*2]
287  lea                dstq, [dstq+dst_strideq*2]
288%endif
289%if %2 == 1 ; avg
290  add                secq, sec_str
291%endif
292  dec                   block_height
293  jg .x_zero_y_zero_loop
294  STORE_AND_RET %1
295
296.x_zero_y_nonzero:
297  cmp           y_offsetd, 4
298  jne .x_zero_y_nonhalf
299
300  ; x_offset == 0 && y_offset == 0.5
301.x_zero_y_half_loop:
302%if %1 == 16
303  movu                 m0, [srcq]
304  movu                 m4, [srcq+src_strideq]
305  mova                 m1, [dstq]
306  pavgb                m0, m4
307  punpckhbw            m3, m1, m5
308%if %2 == 1 ; avg
309  pavgb                m0, [secq]
310%endif
311  punpcklbw            m1, m5
312  punpckhbw            m2, m0, m5
313  punpcklbw            m0, m5
314  SUM_SSE              m0, m1, m2, m3, m6, m7
315
316  add                srcq, src_strideq
317  add                dstq, dst_strideq
318%else ; %1 < 16
319  movx                 m0, [srcq]
320  movx                 m2, [srcq+src_strideq]
321%if %2 == 1 ; avg
322%if %1 > 4
323  movhps               m2, [srcq+src_strideq*2]
324%else ; 4xh
325  movx                 m1, [srcq+src_strideq*2]
326  punpckldq            m2, m1
327%endif
328  movx                 m1, [dstq]
329%if %1 > 4
330  movlhps              m0, m2
331%else ; 4xh
332  punpckldq            m0, m2
333%endif
334  movx                 m3, [dstq+dst_strideq]
335  pavgb                m0, m2
336  punpcklbw            m1, m5
337%if %1 > 4
338  pavgb                m0, [secq]
339  punpcklbw            m3, m5
340  punpckhbw            m2, m0, m5
341  punpcklbw            m0, m5
342%else ; 4xh
343  movh                 m4, [secq]
344  pavgb                m0, m4
345  punpcklbw            m3, m5
346  punpcklbw            m0, m5
347  movhlps              m2, m0
348%endif
349%else ; !avg
350  movx                 m4, [srcq+src_strideq*2]
351  movx                 m1, [dstq]
352  pavgb                m0, m2
353  movx                 m3, [dstq+dst_strideq]
354  pavgb                m2, m4
355  punpcklbw            m0, m5
356  punpcklbw            m2, m5
357  punpcklbw            m3, m5
358  punpcklbw            m1, m5
359%endif
360  SUM_SSE              m0, m1, m2, m3, m6, m7
361
362  lea                srcq, [srcq+src_strideq*2]
363  lea                dstq, [dstq+dst_strideq*2]
364%endif
365%if %2 == 1 ; avg
366  add                secq, sec_str
367%endif
368  dec                   block_height
369  jg .x_zero_y_half_loop
370  STORE_AND_RET %1
371
372.x_zero_y_nonhalf:
373  ; x_offset == 0 && y_offset == bilin interpolation
374%ifdef PIC
375  lea        bilin_filter, [bilin_filter_m]
376%endif
377  shl           y_offsetd, filter_idx_shift
378%if ARCH_X86_64 && %1 > 4
379  mova                 m8, [bilin_filter+y_offsetq]
380%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
381  mova                 m9, [bilin_filter+y_offsetq+16]
382%endif
383  mova                m10, [pw_8]
384%define filter_y_a m8
385%define filter_y_b m9
386%define filter_rnd m10
387%else ; x86-32 or mmx
388%if ARCH_X86=1 && CONFIG_PIC=1
389; x_offset == 0, reuse x_offset reg
390%define tempq x_offsetq
391  add y_offsetq, g_bilin_filterm
392%define filter_y_a [y_offsetq]
393%define filter_y_b [y_offsetq+16]
394  mov tempq, g_pw_8m
395%define filter_rnd [tempq]
396%else
397  add           y_offsetq, bilin_filter
398%define filter_y_a [y_offsetq]
399%define filter_y_b [y_offsetq+16]
400%define filter_rnd [pw_8]
401%endif
402%endif
403
404.x_zero_y_other_loop:
405%if %1 == 16
406  movu                 m0, [srcq]
407  movu                 m4, [srcq+src_strideq]
408  mova                 m1, [dstq]
409%if cpuflag(ssse3)
410  punpckhbw            m2, m0, m4
411  punpcklbw            m0, m4
412  pmaddubsw            m2, filter_y_a
413  pmaddubsw            m0, filter_y_a
414  paddw                m2, filter_rnd
415  paddw                m0, filter_rnd
416%else
417  punpckhbw            m2, m0, m5
418  punpckhbw            m3, m4, m5
419  punpcklbw            m0, m5
420  punpcklbw            m4, m5
421  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
422  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
423  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
424  ; slightly faster because of pmullw latency. It would also cut our rodata
425  ; tables in half for this function, and save 1-2 registers on x86-64.
426  pmullw               m2, filter_y_a
427  pmullw               m3, filter_y_b
428  paddw                m2, filter_rnd
429  pmullw               m0, filter_y_a
430  pmullw               m4, filter_y_b
431  paddw                m0, filter_rnd
432  paddw                m2, m3
433  paddw                m0, m4
434%endif
435  psraw                m2, 4
436  psraw                m0, 4
437%if %2 == 1 ; avg
438  ; FIXME(rbultje) pipeline
439  packuswb             m0, m2
440  pavgb                m0, [secq]
441  punpckhbw            m2, m0, m5
442  punpcklbw            m0, m5
443%endif
444  punpckhbw            m3, m1, m5
445  punpcklbw            m1, m5
446  SUM_SSE              m0, m1, m2, m3, m6, m7
447
448  add                srcq, src_strideq
449  add                dstq, dst_strideq
450%else ; %1 < 16
451  movx                 m0, [srcq]
452  movx                 m2, [srcq+src_strideq]
453  movx                 m4, [srcq+src_strideq*2]
454  movx                 m3, [dstq+dst_strideq]
455%if cpuflag(ssse3)
456  movx                 m1, [dstq]
457  punpcklbw            m0, m2
458  punpcklbw            m2, m4
459  pmaddubsw            m0, filter_y_a
460  pmaddubsw            m2, filter_y_a
461  punpcklbw            m3, m5
462  paddw                m2, filter_rnd
463  paddw                m0, filter_rnd
464%else
465  punpcklbw            m0, m5
466  punpcklbw            m2, m5
467  punpcklbw            m4, m5
468  pmullw               m0, filter_y_a
469  pmullw               m1, m2, filter_y_b
470  punpcklbw            m3, m5
471  paddw                m0, filter_rnd
472  pmullw               m2, filter_y_a
473  pmullw               m4, filter_y_b
474  paddw                m0, m1
475  paddw                m2, filter_rnd
476  movx                 m1, [dstq]
477  paddw                m2, m4
478%endif
479  psraw                m0, 4
480  psraw                m2, 4
481%if %2 == 1 ; avg
482  ; FIXME(rbultje) pipeline
483%if %1 == 4
484  movlhps              m0, m2
485%endif
486  packuswb             m0, m2
487%if %1 > 4
488  pavgb                m0, [secq]
489  punpckhbw            m2, m0, m5
490  punpcklbw            m0, m5
491%else ; 4xh
492  movh                 m2, [secq]
493  pavgb                m0, m2
494  punpcklbw            m0, m5
495  movhlps              m2, m0
496%endif
497%endif
498  punpcklbw            m1, m5
499  SUM_SSE              m0, m1, m2, m3, m6, m7
500
501  lea                srcq, [srcq+src_strideq*2]
502  lea                dstq, [dstq+dst_strideq*2]
503%endif
504%if %2 == 1 ; avg
505  add                secq, sec_str
506%endif
507  dec                   block_height
508  jg .x_zero_y_other_loop
509%undef filter_y_a
510%undef filter_y_b
511%undef filter_rnd
512  STORE_AND_RET %1
513
514.x_nonzero:
515  cmp           x_offsetd, 4
516  jne .x_nonhalf
517  ; x_offset == 0.5
518  test          y_offsetd, y_offsetd
519  jnz .x_half_y_nonzero
520
521  ; x_offset == 0.5 && y_offset == 0
522.x_half_y_zero_loop:
523%if %1 == 16
524  movu                 m0, [srcq]
525  movu                 m4, [srcq+1]
526  mova                 m1, [dstq]
527  pavgb                m0, m4
528  punpckhbw            m3, m1, m5
529%if %2 == 1 ; avg
530  pavgb                m0, [secq]
531%endif
532  punpcklbw            m1, m5
533  punpckhbw            m2, m0, m5
534  punpcklbw            m0, m5
535  SUM_SSE              m0, m1, m2, m3, m6, m7
536
537  add                srcq, src_strideq
538  add                dstq, dst_strideq
539%else ; %1 < 16
540  movx                 m0, [srcq]
541  movx                 m4, [srcq+1]
542%if %2 == 1 ; avg
543%if %1 > 4
544  movhps               m0, [srcq+src_strideq]
545  movhps               m4, [srcq+src_strideq+1]
546%else ; 4xh
547  movx                 m1, [srcq+src_strideq]
548  punpckldq            m0, m1
549  movx                 m2, [srcq+src_strideq+1]
550  punpckldq            m4, m2
551%endif
552  movx                 m1, [dstq]
553  movx                 m3, [dstq+dst_strideq]
554  pavgb                m0, m4
555  punpcklbw            m3, m5
556%if %1 > 4
557  pavgb                m0, [secq]
558  punpcklbw            m1, m5
559  punpckhbw            m2, m0, m5
560  punpcklbw            m0, m5
561%else ; 4xh
562  movh                 m2, [secq]
563  pavgb                m0, m2
564  punpcklbw            m1, m5
565  punpcklbw            m0, m5
566  movhlps              m2, m0
567%endif
568%else ; !avg
569  movx                 m2, [srcq+src_strideq]
570  movx                 m1, [dstq]
571  pavgb                m0, m4
572  movx                 m4, [srcq+src_strideq+1]
573  movx                 m3, [dstq+dst_strideq]
574  pavgb                m2, m4
575  punpcklbw            m0, m5
576  punpcklbw            m2, m5
577  punpcklbw            m3, m5
578  punpcklbw            m1, m5
579%endif
580  SUM_SSE              m0, m1, m2, m3, m6, m7
581
582  lea                srcq, [srcq+src_strideq*2]
583  lea                dstq, [dstq+dst_strideq*2]
584%endif
585%if %2 == 1 ; avg
586  add                secq, sec_str
587%endif
588  dec                   block_height
589  jg .x_half_y_zero_loop
590  STORE_AND_RET %1
591
592.x_half_y_nonzero:
593  cmp           y_offsetd, 4
594  jne .x_half_y_nonhalf
595
596  ; x_offset == 0.5 && y_offset == 0.5
597%if %1 == 16
598  movu                 m0, [srcq]
599  movu                 m3, [srcq+1]
600  add                srcq, src_strideq
601  pavgb                m0, m3
602.x_half_y_half_loop:
603  movu                 m4, [srcq]
604  movu                 m3, [srcq+1]
605  mova                 m1, [dstq]
606  pavgb                m4, m3
607  punpckhbw            m3, m1, m5
608  pavgb                m0, m4
609%if %2 == 1 ; avg
610  punpcklbw            m1, m5
611  pavgb                m0, [secq]
612  punpckhbw            m2, m0, m5
613  punpcklbw            m0, m5
614%else
615  punpckhbw            m2, m0, m5
616  punpcklbw            m0, m5
617  punpcklbw            m1, m5
618%endif
619  SUM_SSE              m0, m1, m2, m3, m6, m7
620  mova                 m0, m4
621
622  add                srcq, src_strideq
623  add                dstq, dst_strideq
624%else ; %1 < 16
625  movx                 m0, [srcq]
626  movx                 m3, [srcq+1]
627  add                srcq, src_strideq
628  pavgb                m0, m3
629.x_half_y_half_loop:
630  movx                 m2, [srcq]
631  movx                 m3, [srcq+1]
632%if %2 == 1 ; avg
633%if %1 > 4
634  movhps               m2, [srcq+src_strideq]
635  movhps               m3, [srcq+src_strideq+1]
636%else
637  movx                 m1, [srcq+src_strideq]
638  punpckldq            m2, m1
639  movx                 m1, [srcq+src_strideq+1]
640  punpckldq            m3, m1
641%endif
642  pavgb                m2, m3
643%if %1 > 4
644  movlhps              m0, m2
645  movhlps              m4, m2
646%else ; 4xh
647  punpckldq            m0, m2
648  pshuflw              m4, m2, 0xe
649%endif
650  movx                 m1, [dstq]
651  pavgb                m0, m2
652  movx                 m3, [dstq+dst_strideq]
653%if %1 > 4
654  pavgb                m0, [secq]
655%else
656  movh                 m2, [secq]
657  pavgb                m0, m2
658%endif
659  punpcklbw            m3, m5
660  punpcklbw            m1, m5
661%if %1 > 4
662  punpckhbw            m2, m0, m5
663  punpcklbw            m0, m5
664%else
665  punpcklbw            m0, m5
666  movhlps              m2, m0
667%endif
668%else ; !avg
669  movx                 m4, [srcq+src_strideq]
670  movx                 m1, [srcq+src_strideq+1]
671  pavgb                m2, m3
672  pavgb                m4, m1
673  pavgb                m0, m2
674  pavgb                m2, m4
675  movx                 m1, [dstq]
676  movx                 m3, [dstq+dst_strideq]
677  punpcklbw            m0, m5
678  punpcklbw            m2, m5
679  punpcklbw            m3, m5
680  punpcklbw            m1, m5
681%endif
682  SUM_SSE              m0, m1, m2, m3, m6, m7
683  mova                 m0, m4
684
685  lea                srcq, [srcq+src_strideq*2]
686  lea                dstq, [dstq+dst_strideq*2]
687%endif
688%if %2 == 1 ; avg
689  add                secq, sec_str
690%endif
691  dec                   block_height
692  jg .x_half_y_half_loop
693  STORE_AND_RET %1
694
695.x_half_y_nonhalf:
696  ; x_offset == 0.5 && y_offset == bilin interpolation
697%ifdef PIC
698  lea        bilin_filter, [bilin_filter_m]
699%endif
700  shl           y_offsetd, filter_idx_shift
701%if ARCH_X86_64 && %1 > 4
702  mova                 m8, [bilin_filter+y_offsetq]
703%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
704  mova                 m9, [bilin_filter+y_offsetq+16]
705%endif
706  mova                m10, [pw_8]
707%define filter_y_a m8
708%define filter_y_b m9
709%define filter_rnd m10
710%else  ;x86_32
711%if ARCH_X86=1 && CONFIG_PIC=1
712; x_offset == 0.5. We can reuse x_offset reg
713%define tempq x_offsetq
714  add y_offsetq, g_bilin_filterm
715%define filter_y_a [y_offsetq]
716%define filter_y_b [y_offsetq+16]
717  mov tempq, g_pw_8m
718%define filter_rnd [tempq]
719%else
720  add           y_offsetq, bilin_filter
721%define filter_y_a [y_offsetq]
722%define filter_y_b [y_offsetq+16]
723%define filter_rnd [pw_8]
724%endif
725%endif
726
727%if %1 == 16
728  movu                 m0, [srcq]
729  movu                 m3, [srcq+1]
730  add                srcq, src_strideq
731  pavgb                m0, m3
732.x_half_y_other_loop:
733  movu                 m4, [srcq]
734  movu                 m2, [srcq+1]
735  mova                 m1, [dstq]
736  pavgb                m4, m2
737%if cpuflag(ssse3)
738  punpckhbw            m2, m0, m4
739  punpcklbw            m0, m4
740  pmaddubsw            m2, filter_y_a
741  pmaddubsw            m0, filter_y_a
742  paddw                m2, filter_rnd
743  paddw                m0, filter_rnd
744  psraw                m2, 4
745%else
746  punpckhbw            m2, m0, m5
747  punpckhbw            m3, m4, m5
748  pmullw               m2, filter_y_a
749  pmullw               m3, filter_y_b
750  paddw                m2, filter_rnd
751  punpcklbw            m0, m5
752  paddw                m2, m3
753  punpcklbw            m3, m4, m5
754  pmullw               m0, filter_y_a
755  pmullw               m3, filter_y_b
756  paddw                m0, filter_rnd
757  psraw                m2, 4
758  paddw                m0, m3
759%endif
760  punpckhbw            m3, m1, m5
761  psraw                m0, 4
762%if %2 == 1 ; avg
763  ; FIXME(rbultje) pipeline
764  packuswb             m0, m2
765  pavgb                m0, [secq]
766  punpckhbw            m2, m0, m5
767  punpcklbw            m0, m5
768%endif
769  punpcklbw            m1, m5
770  SUM_SSE              m0, m1, m2, m3, m6, m7
771  mova                 m0, m4
772
773  add                srcq, src_strideq
774  add                dstq, dst_strideq
775%else ; %1 < 16
776  movx                 m0, [srcq]
777  movx                 m3, [srcq+1]
778  add                srcq, src_strideq
779  pavgb                m0, m3
780%if notcpuflag(ssse3)
781  punpcklbw            m0, m5
782%endif
783.x_half_y_other_loop:
784  movx                 m2, [srcq]
785  movx                 m1, [srcq+1]
786  movx                 m4, [srcq+src_strideq]
787  movx                 m3, [srcq+src_strideq+1]
788  pavgb                m2, m1
789  pavgb                m4, m3
790  movx                 m3, [dstq+dst_strideq]
791%if cpuflag(ssse3)
792  movx                 m1, [dstq]
793  punpcklbw            m0, m2
794  punpcklbw            m2, m4
795  pmaddubsw            m0, filter_y_a
796  pmaddubsw            m2, filter_y_a
797  punpcklbw            m3, m5
798  paddw                m0, filter_rnd
799  paddw                m2, filter_rnd
800%else
801  punpcklbw            m2, m5
802  punpcklbw            m4, m5
803  pmullw               m0, filter_y_a
804  pmullw               m1, m2, filter_y_b
805  punpcklbw            m3, m5
806  paddw                m0, filter_rnd
807  pmullw               m2, filter_y_a
808  paddw                m0, m1
809  pmullw               m1, m4, filter_y_b
810  paddw                m2, filter_rnd
811  paddw                m2, m1
812  movx                 m1, [dstq]
813%endif
814  psraw                m0, 4
815  psraw                m2, 4
816%if %2 == 1 ; avg
817  ; FIXME(rbultje) pipeline
818%if %1 == 4
819  movlhps              m0, m2
820%endif
821  packuswb             m0, m2
822%if %1 > 4
823  pavgb                m0, [secq]
824  punpckhbw            m2, m0, m5
825  punpcklbw            m0, m5
826%else
827  movh                 m2, [secq]
828  pavgb                m0, m2
829  punpcklbw            m0, m5
830  movhlps              m2, m0
831%endif
832%endif
833  punpcklbw            m1, m5
834  SUM_SSE              m0, m1, m2, m3, m6, m7
835  mova                 m0, m4
836
837  lea                srcq, [srcq+src_strideq*2]
838  lea                dstq, [dstq+dst_strideq*2]
839%endif
840%if %2 == 1 ; avg
841  add                secq, sec_str
842%endif
843  dec                   block_height
844  jg .x_half_y_other_loop
845%undef filter_y_a
846%undef filter_y_b
847%undef filter_rnd
848  STORE_AND_RET %1
849
850.x_nonhalf:
851  test          y_offsetd, y_offsetd
852  jnz .x_nonhalf_y_nonzero
853
854  ; x_offset == bilin interpolation && y_offset == 0
855%ifdef PIC
856  lea        bilin_filter, [bilin_filter_m]
857%endif
858  shl           x_offsetd, filter_idx_shift
859%if ARCH_X86_64 && %1 > 4
860  mova                 m8, [bilin_filter+x_offsetq]
861%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
862  mova                 m9, [bilin_filter+x_offsetq+16]
863%endif
864  mova                m10, [pw_8]
865%define filter_x_a m8
866%define filter_x_b m9
867%define filter_rnd m10
868%else    ; x86-32
869%if ARCH_X86=1 && CONFIG_PIC=1
870;y_offset == 0. We can reuse y_offset reg.
871%define tempq y_offsetq
872  add x_offsetq, g_bilin_filterm
873%define filter_x_a [x_offsetq]
874%define filter_x_b [x_offsetq+16]
875  mov tempq, g_pw_8m
876%define filter_rnd [tempq]
877%else
878  add           x_offsetq, bilin_filter
879%define filter_x_a [x_offsetq]
880%define filter_x_b [x_offsetq+16]
881%define filter_rnd [pw_8]
882%endif
883%endif
884
885.x_other_y_zero_loop:
886%if %1 == 16
887  movu                 m0, [srcq]
888  movu                 m4, [srcq+1]
889  mova                 m1, [dstq]
890%if cpuflag(ssse3)
891  punpckhbw            m2, m0, m4
892  punpcklbw            m0, m4
893  pmaddubsw            m2, filter_x_a
894  pmaddubsw            m0, filter_x_a
895  paddw                m2, filter_rnd
896  paddw                m0, filter_rnd
897%else
898  punpckhbw            m2, m0, m5
899  punpckhbw            m3, m4, m5
900  punpcklbw            m0, m5
901  punpcklbw            m4, m5
902  pmullw               m2, filter_x_a
903  pmullw               m3, filter_x_b
904  paddw                m2, filter_rnd
905  pmullw               m0, filter_x_a
906  pmullw               m4, filter_x_b
907  paddw                m0, filter_rnd
908  paddw                m2, m3
909  paddw                m0, m4
910%endif
911  psraw                m2, 4
912  psraw                m0, 4
913%if %2 == 1 ; avg
914  ; FIXME(rbultje) pipeline
915  packuswb             m0, m2
916  pavgb                m0, [secq]
917  punpckhbw            m2, m0, m5
918  punpcklbw            m0, m5
919%endif
920  punpckhbw            m3, m1, m5
921  punpcklbw            m1, m5
922  SUM_SSE              m0, m1, m2, m3, m6, m7
923
924  add                srcq, src_strideq
925  add                dstq, dst_strideq
926%else ; %1 < 16
927  movx                 m0, [srcq]
928  movx                 m1, [srcq+1]
929  movx                 m2, [srcq+src_strideq]
930  movx                 m4, [srcq+src_strideq+1]
931  movx                 m3, [dstq+dst_strideq]
932%if cpuflag(ssse3)
933  punpcklbw            m0, m1
934  movx                 m1, [dstq]
935  punpcklbw            m2, m4
936  pmaddubsw            m0, filter_x_a
937  pmaddubsw            m2, filter_x_a
938  punpcklbw            m3, m5
939  paddw                m0, filter_rnd
940  paddw                m2, filter_rnd
941%else
942  punpcklbw            m0, m5
943  punpcklbw            m1, m5
944  punpcklbw            m2, m5
945  punpcklbw            m4, m5
946  pmullw               m0, filter_x_a
947  pmullw               m1, filter_x_b
948  punpcklbw            m3, m5
949  paddw                m0, filter_rnd
950  pmullw               m2, filter_x_a
951  pmullw               m4, filter_x_b
952  paddw                m0, m1
953  paddw                m2, filter_rnd
954  movx                 m1, [dstq]
955  paddw                m2, m4
956%endif
957  psraw                m0, 4
958  psraw                m2, 4
959%if %2 == 1 ; avg
960  ; FIXME(rbultje) pipeline
961%if %1 == 4
962  movlhps              m0, m2
963%endif
964  packuswb             m0, m2
965%if %1 > 4
966  pavgb                m0, [secq]
967  punpckhbw            m2, m0, m5
968  punpcklbw            m0, m5
969%else
970  movh                 m2, [secq]
971  pavgb                m0, m2
972  punpcklbw            m0, m5
973  movhlps              m2, m0
974%endif
975%endif
976  punpcklbw            m1, m5
977  SUM_SSE              m0, m1, m2, m3, m6, m7
978
979  lea                srcq, [srcq+src_strideq*2]
980  lea                dstq, [dstq+dst_strideq*2]
981%endif
982%if %2 == 1 ; avg
983  add                secq, sec_str
984%endif
985  dec                   block_height
986  jg .x_other_y_zero_loop
987%undef filter_x_a
988%undef filter_x_b
989%undef filter_rnd
990  STORE_AND_RET %1
991
.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 4
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
  ; Strategy: horizontally bilin-filter each row, then average vertically
  ; with pavg against the previous filtered row.
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  ; scale the offset into a byte index into the filter table
  ; (filter_idx_shift is defined in the macro prologue, outside this excerpt)
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  ; x86-64: keep filter taps and the rounding constant in registers
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  ; prime the loop: horizontally filter the first row into m0 (packed bytes)
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  ; one row per iteration: filter the next row, pavgb it with the previous
  ; filtered row held in m0, then accumulate against dst
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [dstq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [secq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  ; carry this row's filtered (pre-average) pixels into the next iteration
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  ; prime: filter the first row (low lane only) into m0, kept as words
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  ; two rows per iteration: filter rows n and n+1 horizontally, then
  ; vertically average each against its predecessor (m0 holds the previous
  ; filtered row as words)
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movx                 m1, [dstq]
  paddw                m4, m3
  movx                 m3, [dstq+dst_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2                     ; row n   = avg(prev, cur)
  pavgw                m2, m4                     ; row n+1 = avg(cur, next)
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  ; carry row n+1's filtered (pre-average) words into the next iteration
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1
1193
.x_nonhalf_y_nonhalf:
  ; Both offsets are arbitrary eighth-pel positions: horizontally bilin-filter
  ; each row, then bilinearly blend consecutive filtered rows with the y taps.
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  ; scale both offsets into byte indices into the filter table
  ; (filter_idx_shift is defined in the macro prologue, outside this excerpt)
  shl           x_offsetd, filter_idx_shift
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  ; x86-64: keep all four tap sets plus the rounding constant in registers
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                m11, [bilin_filter+y_offsetq+16]
%endif
  mova                m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  ; prime the loop: horizontally filter the first row into m0 (packed bytes)
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  ; INC_SRC_BY_SRC_STRIDE handles the x86-32 PIC case where src_strideq was
  ; repurposed above and the stride must be reloaded from the stack
  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
  ; per row: horizontal filter into m4, then y-blend against the previous
  ; filtered row in m0 before accumulating against dst
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  ; vertical pass: interleave prev/cur rows and pmaddubsw with the y taps
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1
  ; vertical pass: prev row (m0) * y_a + cur row (m3/m1) * y_b
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [dstq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  ; carry this row's horizontally-filtered bytes into the next iteration
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  add                dstq, dst_strideq
%else ; %1 < 16
  ; prime: filter the first row (low lane only) into m0
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  ; two rows per iteration: horizontally filter rows n and n+1, then
  ; y-blend each against its predecessor (m0 holds the previous row)
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movx                 m4, [srcq]
  movx                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m3, [dstq+dst_strideq]
  movx                 m1, [dstq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  ; vertical pass via pmaddubsw on (prev,cur) byte pairs
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  ; vertical pass: prev * y_a + cur * y_b for both row pairs
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, m1
  movx                 m1, [dstq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  ; carry row n+1's horizontally-filtered pixels into the next iteration
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
  STORE_AND_RET %1
1460%endmacro
1461
1462; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
1463; between the ssse3 and non-ssse3 version. It may make sense to merge their
1464; code in the sense that the ssse3 version would jump to the appropriate
1465; location in the sse/2 version, rather than duplicating that code in the
1466; binary.
1467
; Instantiate the 4-, 8- and 16-wide sub-pixel variance kernels.  The first
; macro argument is the block width; the optional second argument (=1)
; selects the "avg" variants, which additionally average the filtered
; prediction with a second predictor (secq) before accumulating the sum/SSE
; (see the "%if %2 == 1 ; avg" paths inside SUBPEL_VARIANCE).

; plain variants
INIT_XMM sse2
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_XMM ssse3
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; avg variants (second-predictor averaging)
INIT_XMM sse2
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

INIT_XMM ssse3
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1
1487