;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times  8 dw  8
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 15
                     times  8 dw  1
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 13
                     times  8 dw  3
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 11
                     times  8 dw  5
                     times  8 dw 10
                     times  8 dw  6
                     times  8 dw  9
                     times  8 dw  7
                     times 16 dw  8
                     times  8 dw  7
                     times  8 dw  9
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  5
                     times  8 dw 11
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  3
                     times  8 dw 13
                     times  8 dw  2
                     times  8 dw 14
                     times  8 dw  1
                     times  8 dw 15

bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 15,  1
                      times  8 db 14,  2
                      times  8 db 13,  3
                      times  8 db 12,  4
                      times  8 db 11,  5
                      times  8 db 10,  6
                      times  8 db  9,  7
                      times 16 db  8
                      times  8 db  7,  9
                      times  8 db  6, 10
                      times  8 db  5, 11
                      times  8 db  4, 12
                      times  8 db  3, 13
                      times  8 db  2, 14
                      times  8 db  1, 15
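; Each table holds 16 filter entries, indexed by the 1/16th-pel offset
; (0..15) scaled by filter_idx_shift below. Entry x is the tap pair
; (16-x, x), which always sums to 16, so a filtered pixel is
; out = (a*(16-x) + b*x + 8) >> 4. The sse2 table stores the taps as words
; for pmullw; the ssse3 table interleaves them as bytes so that a single
; pmaddubsw computes a*(16-x) + b*x per lane.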

SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of differences (SE) as its return value and
; stores the sum of squared differences (SSE) in the given pointer.
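;
; Rough scalar model of what each variant computes (a sketch: "pred" here
; stands for the bilinearly filtered source block, optionally averaged with
; the second predictor in the avg variants; the C caller is then assumed to
; derive the variance as sse - se*se/(N*height)):
;
;   int se = 0; unsigned int sse_acc = 0;
;   for (i = 0; i < height; i++)
;     for (j = 0; j < N; j++) {
;       const int diff = pred[i * N + j] - dst[i * dst_stride + j];
;       se += diff; sse_acc += diff * diff;
;     }
;   *sse = sse_acc; return se;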

%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  paddw                %5, %3
  pmaddwd              %3, %3
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3
  paddd                %6, %1
%endmacro
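; Scalar sketch of one SUM_SSE invocation, per 16-bit lane:
;   d1 = src1 - dst1; d2 = src2 - dst2;   (psubw)
;   sum += d1 + d2;                       (paddw; sign fixed in STORE_AND_RET)
;   sse += d1*d1 + d2*d2;                 (pmaddwd pairs squares into dwords)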

%macro STORE_AND_RET 0
%if mmsize == 16
  ; If H=64 and W=16, each of the 8 words in m6 accumulates up to
  ; 2 (1 bit) x 64 (6 bit) x 9 bit = 16 bit, i.e. it _exactly_ fits in a
  ; signed word per word in the xmm reg. We have to sign-extend it before
  ; adding the words within the register and outputting to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4
  movd                rax, m6           ; store sum as return value
%else ; mmsize == 8
  pshufw               m4, m6, 0xe
  pshufw               m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshufw               m4, m6, 0xe
  paddd                m6, m4
  movd                rax, m6           ; store sum as return value
%endif
  RET
%endmacro
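; Worked bound for the 16-bit sum lanes: with W == 16, each of the 8 word
; lanes of m6 sees 2 columns per row, |diff| <= 255 and at most 64 rows, so
; |lane| <= 2 * 64 * 255 = 32640 < 2^15, which is why the word->dword
; sign-extension above is safe.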

%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro
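; Used by the x_offset==bilin && y_offset==bilin path, where x86-32 PIC
; builds repurpose the src_stride register as a temporary; the stride must
; then be reloaded from its stack slot (src_stridemp). Everywhere else it is
; live in src_strideq.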

%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC    ; 64bit PIC
  %if %2 == 1 ; avg
    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      dst, dst_stride, \
                                      sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                  y_offset, dst, dst_stride, height, sse
  %endif
  %define h heightd
  %define bilin_filter sseq
%else
  %if ARCH_X86=1 && CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                  x_offset, y_offset, \
                                  dst, dst_stride, \
                                  sec, sec_stride, \
                                  height, sse, g_bilin_filter, g_pw_8
      %define h dword heightm
      %define sec_str sec_stridemp

      ; Store the bilin_filter and pw_8 locations on the stack
      GET_GOT eax
      add esp, 4                ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse, \
                                g_bilin_filter, g_pw_8
      %define h heightd

      ; Store the bilin_filter and pw_8 locations on the stack
      GET_GOT eax
      add esp, 4                ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %endif
  %else
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             dst, dst_stride, \
                                             sec, sec_stride, \
                                             height, sse
      %if ARCH_X86_64
      %define h heightd
      %define sec_str sec_strideq
      %else
      %define h dword heightm
      %define sec_str sec_stridemp
      %endif
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                              y_offset, dst, dst_stride, height, sse
      %define h heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor                 m5, m5           ; dedicated zero register
%if %1 < 16
  sar                   h, 1
%if %2 == 1 ; avg
  shl             sec_str, 1
%endif
%endif
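  ; For W < 16 each loop iteration below consumes two rows at once (two movh
  ; loads packed into one register), so the row count is halved and, in the
  ; avg case, the second-prediction stride is doubled to match.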

  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  mova                 m1, [dstq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%if %2 == 0 ; !avg
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m0, [srcq+src_strideq]
%else ; mmsize == 8
  punpckldq            m0, [srcq+src_strideq]
%endif
%else ; !avg
  movh                 m2, [srcq+src_strideq]
%endif
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
  movh                 m1, [srcq+src_strideq*2]
  punpckldq            m2, m1
%else
  punpckldq            m2, [srcq+src_strideq*2]
%endif
%endif
  movh                 m1, [dstq]
%if mmsize == 16
  movlhps              m0, m2
%else ; mmsize == 8
  punpckldq            m0, m2
%endif
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, m2
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m4, [srcq+src_strideq*2]
  movh                 m1, [dstq]
  pavgb                m0, m2
  movh                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
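  ; With the filter loaded, each output pixel below is the 1/16th-pel blend
  ;   out = (a * filter_y_a + b * filter_y_b + 8) >> 4
  ; of vertically adjacent rows a and b: pw_8 supplies the rounding term and
  ; psraw by 4 the divide by 16. The ssse3 path fuses both multiplies into a
  ; single pmaddubsw on byte-interleaved rows.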

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
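  ; A worked check of that identity (with num=16, rnd=8, arithmetic shift):
  ;   ((16-x)*in1 + x*in2 + 8) >> 4 == (16*in1 + x*(in2-in1) + 8) >> 4
  ;                                 == in1 + ((x*(in2-in1) + 8) >> 4)
  ; because 16*in1 is a multiple of 16 and contributes exactly in1.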
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m2, [srcq+src_strideq]
  movh                 m4, [srcq+src_strideq*2]
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movh                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp           x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m0, [srcq+src_strideq]
  movhps               m4, [srcq+src_strideq+1]
%else ; mmsize == 8
  punpckldq            m0, [srcq+src_strideq]
  punpckldq            m4, [srcq+src_strideq+1]
%endif
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, m4
  punpcklbw            m3, m5
  pavgb                m0, [secq]
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m2, [srcq+src_strideq]
  movh                 m1, [dstq]
  pavgb                m0, m4
  movh                 m4, [srcq+src_strideq+1]
  movh                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m3
  punpckhbw            m3, m1, m5
  pavgb                m0, m4
%if %2 == 1 ; avg
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movh                 m2, [srcq]
  movh                 m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m2, [srcq+src_strideq]
  movhps               m3, [srcq+src_strideq+1]
%else
%if %1 == 4
  movh                 m1, [srcq+src_strideq]
  punpckldq            m2, m1
  movh                 m1, [srcq+src_strideq+1]
  punpckldq            m3, m1
%else
  punpckldq            m2, [srcq+src_strideq]
  punpckldq            m3, [srcq+src_strideq+1]
%endif
%endif
  pavgb                m2, m3
%if mmsize == 16
  movlhps              m0, m2
  movhlps              m4, m2
%else ; mmsize == 8
  punpckldq            m0, m2
  pshufw               m4, m2, 0xe
%endif
  movh                 m1, [dstq]
  pavgb                m0, m2
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m4, [srcq+src_strideq]
  movh                 m1, [srcq+src_strideq+1]
  pavgb                m2, m3
  pavgb                m4, m1
  pavgb                m0, m2
  pavgb                m2, m4
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_other_loop:
  movu                 m4, [srcq]
  movu                 m2, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m2
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  punpcklbw            m0, m5
  paddw                m2, m3
  punpcklbw            m3, m4, m5
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
%endif
  punpckhbw            m3, m1, m5
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
%if notcpuflag(ssse3)
  punpcklbw            m0, m5
%endif
.x_half_y_other_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]
  movh                 m4, [srcq+src_strideq]
  movh                 m3, [srcq+src_strideq+1]
  pavgb                m2, m1
  pavgb                m4, m3
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  paddw                m0, m1
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m2, m1
  movh                 m1, [dstq]
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
  movh                 m2, [srcq+src_strideq]
  movh                 m4, [srcq+src_strideq+1]
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  movh                 m1, [dstq]
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_x_a
  pmaddubsw            m2, filter_x_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movh                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [dstq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [secq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]
  movh                 m4, [srcq+src_strideq]
  movh                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movh                 m1, [dstq]
  paddw                m4, m3
  movh                 m3, [dstq+dst_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2
  pavgw                m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                m11, [bilin_filter+y_offsetq+16]
%endif
  mova                m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. We use the src_stride register;
; later, src_stride has to be reloaded from the stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
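  ; The code below is a separable two-pass filter: each iteration
  ; horizontally filters one new row (two for W < 16), then vertically blends
  ; it with the previously filtered row, which is carried across iterations
  ; in m0 (via the trailing "mova m0, m4"), so every source row is fetched
  ; and horizontally filtered only once.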
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [dstq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movh                 m4, [srcq]
  movh                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movh                 m3, [dstq+dst_strideq]
  movh                 m1, [dstq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movh                 m3, [dstq+dst_strideq]
  paddw                m2, m1
  movh                 m1, [dstq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are identical
; between the ssse3 and non-ssse3 versions. It may make sense to merge their
; code by having the ssse3 version jump to the appropriate location in the
; sse/sse2 version, rather than duplicating that code in the binary.

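; Instantiate the macro into the exported functions. Each INIT_MMX/INIT_XMM
; line selects the register set and the cpu suffix that x86inc appends to
; the name (_sse, _sse2, _ssse3); the ", 1" invocations generate the
; sub_pixel_avg_variance variants, which pavgb in a second predictor before
; computing the differences.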
INIT_MMX sse
SUBPEL_VARIANCE  4
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX ssse3
SUBPEL_VARIANCE  4
INIT_XMM ssse3
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX sse
SUBPEL_VARIANCE  4, 1
INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

INIT_MMX ssse3
SUBPEL_VARIANCE  4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1