1;
2;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
13SECTION_RODATA
; Eight 16-bit words of 64. The filters below add this value (as krd) to each
; sum right before the arithmetic shift right by 7.
14pw_64:    times 8 dw 64
15
16; %define USE_PMULHRSW
17; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
18; when using this instruction.
19;
20; The add order below (based on ffvp9) must be kept to avoid out-of-range sums.
21; x = k0k1 + k4k5
22; y = k2k3 + k6k7
23; z = signed SAT(x + y)
24
25SECTION .text
; Size passed to cglobal by every filter that uses SETUP_LOCAL_VARS: six
; 16-byte slots, addressed there as [rsp + 16*0] .. [rsp + 16*5].
26%define LOCAL_VARS_SIZE 16*6
27
; SETUP_LOCAL_VARS
; Input:  m4 = the filter taps as eight 16-bit words (every caller loads it
;         from [filterq] immediately before invoking this macro).
; Effect: packs the taps to signed bytes, broadcasts each adjacent pair
;         (k0k1, k2k3, k4k5, k6k7) across a whole register and stores the four
;         results at [rsp + 16*0] .. [rsp + 16*3], where they are later used
;         as pmaddubsw operands. Also prepares krd, eight words of 64 added
;         for rounding: kept in m12 on x86-64, at [rsp + 16*4] otherwise.
;         On x86-64 two further stack slots are named tmp0 and tmp1.
; Clobbers m0-m4, plus m12 (x86-64) or m6 (otherwise).
; NOTE(review): the stores use mova, so rsp is presumably 16-byte aligned at
; this point (expected to come from cglobal's stack allocation -- confirm
; against x86inc).
28%macro SETUP_LOCAL_VARS 0
29    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
30    ; pmaddubsw has a higher latency on some platforms, this might be eased by
31    ; interleaving the instructions.
32    %define    k0k1  [rsp + 16*0]
33    %define    k2k3  [rsp + 16*1]
34    %define    k4k5  [rsp + 16*2]
35    %define    k6k7  [rsp + 16*3]
    ; Words -> signed saturated bytes; the low 8 bytes of m4 become k0..k7.
36    packsswb     m4, m4
37    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
38    ; some platforms.
    ; Each pshuflw replicates one 16-bit word (a pair of byte taps) across the
    ; low four words of its destination.
39    pshuflw      m0, m4, 0b              ;k0_k1
40    pshuflw      m1, m4, 01010101b       ;k2_k3
41    pshuflw      m2, m4, 10101010b       ;k4_k5
42    pshuflw      m3, m4, 11111111b       ;k6_k7
    ; Duplicate the low qword into the high qword so the pair fills all 16 bytes.
43    punpcklqdq   m0, m0
44    punpcklqdq   m1, m1
45    punpcklqdq   m2, m2
46    punpcklqdq   m3, m3
47    mova       k0k1, m0
48    mova       k2k3, m1
49    mova       k4k5, m2
50    mova       k6k7, m3
51%if ARCH_X86_64
52    %define     krd  m12
53    %define    tmp0  [rsp + 16*4]
54    %define    tmp1  [rsp + 16*5]
55    mova        krd, [GLOBAL(pw_64)]
56%else
    ; Without m12 available, the rounding constant lives on the stack.
57    %define     krd  [rsp + 16*4]
58%if CONFIG_PIC=0
59    mova         m6, [GLOBAL(pw_64)]
60%else
61    ; build constants without accessing global memory
62    pcmpeqb      m6, m6                  ;all ones
63    psrlw        m6, 15
64    psllw        m6, 6                   ;aka pw_64
65%endif
66    mova        krd, m6
67%endif
68%endm
69
70;-------------------------------------------------------------------------------
; Stack bytes requested by the 4-wide horizontal filter. Only the non-x86-64
; build keeps its tap constants and rounding term in [rsp + 16*n] slots; the
; x86-64 build holds them in registers and needs no stack space.
%if ARCH_X86_64=0
  %define LOCAL_VARS_SIZE_H4 16*4
%else
  %define LOCAL_VARS_SIZE_H4 0
%endif
76
; SUBPIX_HFILTER4 <variant>
; Emits filter_block1d4_<variant>(src, sstride, dst, dstride, height, filter):
; an 8-tap filter applied along each row that writes 4 bytes per row.
; <variant> is instantiated below as h8 and h8_avg; for h8_avg the filtered
; bytes are averaged (pavgb) with the bytes already at dst before the store.
; Each row is read as 16 bytes starting at src - 3. Rows are handled two at a
; time, followed by a single-row tail when height is odd.
; NOTE(review): the two-row loop body executes before height is tested, so a
; call with height < 2 would still process two rows -- confirm that callers
; never pass such a height.
77%macro SUBPIX_HFILTER4 1
78cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
79                            src, sstride, dst, dstride, height, filter
80    mova                m4, [filterq]
81    packsswb            m4, m4
    ; After the pack both qwords of m4 hold the taps k0..k7 as signed bytes.
    ; The shuffles below build two constants: k0k1 pairs in the low qword with
    ; k4k5 pairs in the high qword, and likewise k2k3 / k6k7.
82%if ARCH_X86_64
83    %define       k0k1k4k5  m8
84    %define       k2k3k6k7  m9
85    %define            krd  m10
86    mova               krd, [GLOBAL(pw_64)]
87    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
88    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
89    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
90    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
91%else
    ; Same constants, but kept in stack slots instead of m8-m10.
92    %define       k0k1k4k5  [rsp + 16*0]
93    %define       k2k3k6k7  [rsp + 16*1]
94    %define            krd  [rsp + 16*2]
95    pshuflw             m6, m4, 0b              ;k0_k1
96    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
97    pshuflw             m7, m4, 01010101b       ;k2_k3
98    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
99%if CONFIG_PIC=0
100    mova                m1, [GLOBAL(pw_64)]
101%else
102    ; build constants without accessing global memory
103    pcmpeqb             m1, m1                  ;all ones
104    psrlw               m1, 15
105    psllw               m1, 6                   ;aka pw_64
106%endif
107    mova          k0k1k4k5, m6
108    mova          k2k3k6k7, m7
109    mova               krd, m1
110%endif
    ; Bias height by one so that, after the "sub heightd, 2" at the bottom of
    ; the loop, >0 means another row pair remains and ==0 means one odd row.
111    dec            heightd
112
113.loop:
114    ;Do two rows at once
115    movu                m4, [srcq - 3]
116    movu                m5, [srcq + sstrideq - 3]
    ; Unpacking a register with itself doubles every byte; palignr at an odd
    ; offset then yields overlapping (s[i], s[i+1]) byte pairs for pmaddubsw.
117    punpckhbw           m1, m4, m4
118    punpcklbw           m4, m4
119    punpckhbw           m3, m5, m5
120    punpcklbw           m5, m5
121    palignr             m0, m1, m4, 1
122    pmaddubsw           m0, k0k1k4k5
123    palignr             m1, m4, 5
124    pmaddubsw           m1, k2k3k6k7
125    palignr             m2, m3, m5, 1
126    pmaddubsw           m2, k0k1k4k5
127    palignr             m3, m5, 5
128    pmaddubsw           m3, k2k3k6k7
    ; Regroup the qwords so that a single paddsw adds the k0k1 terms to the
    ; k4k5 terms (and k2k3 to k6k7) for both rows at once.
129    punpckhqdq          m4, m0, m2
130    punpcklqdq          m0, m2
131    punpckhqdq          m5, m1, m3
132    punpcklqdq          m1, m3
133    paddsw              m0, m4
134    paddsw              m1, m5
135%ifidn %1, h8_avg
136    movd                m4, [dstq]
137    movd                m5, [dstq + dstrideq]
138%endif
139    paddsw              m0, m1
140    paddsw              m0, krd
141    psraw               m0, 7
142    packuswb            m0, m0
    ; Bytes 0-3 of m0 are the first row, bytes 4-7 the second row.
143    psrldq              m1, m0, 4
144
145%ifidn %1, h8_avg
146    pavgb               m0, m4
147    pavgb               m1, m5
148%endif
149    movd            [dstq], m0
150    movd [dstq + dstrideq], m1
151
    ; Advance src by two rows, one row at a time, prefetching in between.
152    lea               srcq, [srcq + sstrideq        ]
153    prefetcht0              [srcq + 4 * sstrideq - 3]
154    lea               srcq, [srcq + sstrideq        ]
155    lea               dstq, [dstq + 2 * dstrideq    ]
156    prefetcht0              [srcq + 2 * sstrideq - 3]
157
158    sub            heightd, 2
159    jg               .loop
160
161    ; Do last row if output_height is odd
162    jne              .done
163
164    movu                m4, [srcq - 3]
165    punpckhbw           m1, m4, m4
166    punpcklbw           m4, m4
167    palignr             m0, m1, m4, 1
168    palignr             m1, m4, 5
169    pmaddubsw           m0, k0k1k4k5
170    pmaddubsw           m1, k2k3k6k7
    ; Bring the high-qword terms (k4k5 / k6k7) down next to the low-qword ones.
171    psrldq              m2, m0, 8
172    psrldq              m3, m1, 8
173    paddsw              m0, m2
174    paddsw              m1, m3
175    paddsw              m0, m1
176    paddsw              m0, krd
177    psraw               m0, 7
178    packuswb            m0, m0
179%ifidn %1, h8_avg
180    movd                m4, [dstq]
181    pavgb               m0, m4
182%endif
183    movd            [dstq], m0
184.done:
185    REP_RET
186%endm
187
188;-------------------------------------------------------------------------------
; SUBPIX_HFILTER8 <variant>
; Emits filter_block1d8_<variant>(src, sstride, dst, dstride, height, filter):
; an 8-tap filter applied along each row that writes 8 bytes per row.
; <variant> is instantiated below as h8 and h8_avg; for h8_avg the filtered
; bytes are averaged (pavgb) with the bytes already at dst before the store.
; Tap constants and the rounding term come from SETUP_LOCAL_VARS. Each row is
; read as 16 bytes starting at src - 3. Rows are handled two at a time,
; followed by a single-row tail when height is odd.
; NOTE(review): the two-row loop body executes before height is tested, so a
; call with height < 2 would still process two rows -- confirm that callers
; never pass such a height.
189%macro SUBPIX_HFILTER8 1
190cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
191                            src, sstride, dst, dstride, height, filter
192    mova                 m4, [filterq]
193    SETUP_LOCAL_VARS
    ; Bias height by one so that, after the "sub heightd, 2" at the bottom of
    ; the loop, >0 means another row pair remains and ==0 means one odd row.
194    dec             heightd
195
196.loop:
197    ;Do two rows at once
198    movu                 m0, [srcq - 3]
199    movu                 m4, [srcq + sstrideq - 3]
    ; First row: double every byte, then take palignr windows at offsets
    ; 1, 5, 9 and 13 to get the byte pairs for k0k1, k2k3, k4k5 and k6k7.
200    punpckhbw            m1, m0, m0
201    punpcklbw            m0, m0
202    palignr              m5, m1, m0, 13
203    pmaddubsw            m5, k6k7
204    palignr              m2, m1, m0, 5
205    palignr              m3, m1, m0, 9
206    palignr              m1, m0, 1
207    pmaddubsw            m1, k0k1
    ; Second row starts here, interleaved with the rest of the first row.
208    punpckhbw            m6, m4, m4
209    punpcklbw            m4, m4
210    pmaddubsw            m2, k2k3
211    pmaddubsw            m3, k4k5
212
213    palignr              m7, m6, m4, 13
214    palignr              m0, m6, m4, 5
215    pmaddubsw            m7, k6k7
    ; First row: (k0k1 + k4k5) + (k2k3 + k6k7), the add order required above.
216    paddsw               m1, m3
217    paddsw               m2, m5
218    paddsw               m1, m2
219%ifidn %1, h8_avg
    ; Existing dst bytes: first row in the low half, second row in the high.
220    movh                 m2, [dstq]
221    movhps               m2, [dstq + dstrideq]
222%endif
223    palignr              m5, m6, m4, 9
224    palignr              m6, m4, 1
225    pmaddubsw            m0, k2k3
226    pmaddubsw            m6, k0k1
227    paddsw               m1, krd
228    pmaddubsw            m5, k4k5
229    psraw                m1, 7
230    paddsw               m0, m7
231    paddsw               m6, m5
232    paddsw               m6, m0
233    paddsw               m6, krd
234    psraw                m6, 7
    ; Low 8 bytes = first row, high 8 bytes = second row.
235    packuswb             m1, m6
236%ifidn %1, h8_avg
237    pavgb                m1, m2
238%endif
239    movh              [dstq], m1
240    movhps [dstq + dstrideq], m1
241
242    lea                srcq, [srcq + sstrideq        ]
243    prefetcht0               [srcq + 4 * sstrideq - 3]
244    lea                srcq, [srcq + sstrideq        ]
245    lea                dstq, [dstq + 2 * dstrideq    ]
246    prefetcht0               [srcq + 2 * sstrideq - 3]
247    sub             heightd, 2
248    jg                .loop
249
250    ; Do last row if output_height is odd
251    jne               .done
252
253    movu                 m0, [srcq - 3]
254    punpckhbw            m3, m0, m0
255    punpcklbw            m0, m0
256    palignr              m1, m3, m0, 1
257    palignr              m2, m3, m0, 5
258    palignr              m4, m3, m0, 13
259    palignr              m3, m0, 9
260    pmaddubsw            m1, k0k1
261    pmaddubsw            m2, k2k3
262    pmaddubsw            m3, k4k5
263    pmaddubsw            m4, k6k7
264    paddsw               m1, m3
265    paddsw               m4, m2
266    paddsw               m1, m4
267    paddsw               m1, krd
268    psraw                m1, 7
269    packuswb             m1, m1
270%ifidn %1, h8_avg
271    movh                 m0, [dstq]
272    pavgb                m1, m0
273%endif
274    movh             [dstq], m1
275.done:
276    REP_RET
277%endm
278
279;-------------------------------------------------------------------------------
; SUBPIX_HFILTER16 <variant>
; Emits filter_block1d16_<variant>(src, sstride, dst, dstride, height,
; filter): an 8-tap filter applied along each row that writes 16 bytes per
; row, one row per loop iteration.
; <variant> is instantiated below as h8 and h8_avg; for h8_avg the filtered
; bytes are averaged (pavgb) with the bytes already at dst before the store.
; Eight unaligned 16-byte loads at src - 3 .. src + 4 feed pmaddubsw directly:
; loads at odd offsets (-3, -1, +1, +3) give the sums for even output pixels,
; loads at even offsets (-2, 0, +2, +4) those for odd output pixels.
; NOTE(review): dst is stored with mova and, in the avg variant, read as a
; pavgb memory operand, so it presumably must be 16-byte aligned -- confirm.
; NOTE(review): the loop is dec/jnz with no test before the first iteration,
; so height is presumably always >= 1 -- confirm against callers.
280%macro SUBPIX_HFILTER16 1
281cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
282                             src, sstride, dst, dstride, height, filter
283    mova          m4, [filterq]
284    SETUP_LOCAL_VARS
285
286.loop:
287    prefetcht0        [srcq + 2 * sstrideq -3]
288
    ; m0-m3 accumulate the even output pixels, m4-m7 the odd ones.
289    movu          m0, [srcq - 3]
290    movu          m4, [srcq - 2]
291    pmaddubsw     m0, k0k1
292    pmaddubsw     m4, k0k1
293    movu          m1, [srcq - 1]
294    movu          m5, [srcq + 0]
295    pmaddubsw     m1, k2k3
296    pmaddubsw     m5, k2k3
297    movu          m2, [srcq + 1]
298    movu          m6, [srcq + 2]
299    pmaddubsw     m2, k4k5
300    pmaddubsw     m6, k4k5
301    movu          m3, [srcq + 3]
302    movu          m7, [srcq + 4]
303    pmaddubsw     m3, k6k7
304    pmaddubsw     m7, k6k7
    ; (k0k1 + k4k5) + (k2k3 + k6k7), the add order required above.
305    paddsw        m0, m2
306    paddsw        m1, m3
307    paddsw        m0, m1
308    paddsw        m4, m6
309    paddsw        m5, m7
310    paddsw        m4, m5
311    paddsw        m0, krd
312    paddsw        m4, krd
313    psraw         m0, 7
314    psraw         m4, 7
315    packuswb      m0, m0
316    packuswb      m4, m4
    ; Interleave even and odd pixels back into row order.
317    punpcklbw     m0, m4
318%ifidn %1, h8_avg
319    pavgb         m0, [dstq]
320%endif
321    lea         srcq, [srcq + sstrideq]
322    mova      [dstq], m0
323    lea         dstq, [dstq + dstrideq]
324    dec      heightd
325    jnz        .loop
326    REP_RET
327%endm
328
; Instantiate the horizontal filters: for each of the widths 16, 8 and 4, a
; plain (h8) and a dst-averaging (h8_avg) version.
329INIT_XMM ssse3
330SUBPIX_HFILTER16 h8
331SUBPIX_HFILTER16 h8_avg
332SUBPIX_HFILTER8  h8
333SUBPIX_HFILTER8  h8_avg
334SUBPIX_HFILTER4  h8
335SUBPIX_HFILTER4  h8_avg
336
337;-------------------------------------------------------------------------------
338
; TODO(Linfeng): Detect cpu type and choose the code with better performance.
; When non-zero, the vertical filters are assembled from the code path that
; reloads every source row from memory on each loop iteration, even on x86-64.
%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1

; Number of general-purpose registers the vertical filters request from
; cglobal: the six arguments, or nine when the path selected above is built
; for x86-64 (it additionally uses r7 and r8).
%if ARCH_X86_64=0 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON=0
    %define NUM_GENERAL_REG_USED 6
%else
    %define NUM_GENERAL_REG_USED 9
%endif
347
; SUBPIX_VFILTER <variant>, <width>
; Emits filter_block1d<width>_<variant>(src, sstride, dst, dstride, height,
; filter): an 8-tap filter applied down each column, for <width> = 8 or 4
; bytes per row (movx is movh for 8, movd otherwise).
; <variant> is instantiated below as v8 and v8_avg; for v8_avg the filtered
; bytes are averaged (pavgb) with the bytes already at dst before the store.
; The eight source rows feeding one output row are labelled A..H, A being the
; row at src. Rows are produced two at a time, followed by a single-row tail
; when height is odd.
; Two implementations are selected at assembly time:
;   * ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON: every iteration
;     reloads all the rows it needs from memory.
;   * otherwise: interleaved row pairs are kept in registers and shifted down
;     by two rows per iteration, so only two new rows are loaded each time.
; NOTE(review): in both paths the two-row loop body executes before height is
; tested, so a call with height < 2 would still process two rows -- confirm
; that callers never pass such a height.
348%macro SUBPIX_VFILTER 2
349cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
350                             src, sstride, dst, dstride, height, filter
351    mova          m4, [filterq]
352    SETUP_LOCAL_VARS
353
354%ifidn %2, 8
355    %define                movx  movh
356%else
357    %define                movx  movd
358%endif
359
    ; Bias height by one so that, after the "sub heightd, 2" at the bottom of
    ; the loop, >0 means another row pair remains and ==0 means one odd row.
360    dec                 heightd
361
362%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
363
364%if ARCH_X86_64
365    %define               src1q  r7
366    %define           sstride6q  r8
367    %define          dst_stride  dstrideq
368%else
    ; No spare registers here: src1q takes over filterq (the taps were already
    ; read from it above), sstride6q takes over dstrideq, and the dst stride
    ; is used through dstridemp instead.
369    %define               src1q  filterq
370    %define           sstride6q  dstrideq
371    %define          dst_stride  dstridemp
372%endif
    ; src1q = src + sstride, i.e. one row below srcq.
373    mov                   src1q, srcq
374    add                   src1q, sstrideq
375    lea               sstride6q, [sstrideq + sstrideq * 4]
376    add               sstride6q, sstrideq                   ;pitch * 6
377
378.loop:
379    ;Do two rows at once
    ; First output row uses the pairs (A,B) (C,D) (E,F) (G,H); the second uses
    ; (B,C) (D,E) (F,G) and (H, row at srcq + sstrideq * 8), the pairs marked
    ; "next iter" below.
380    movx                     m0, [srcq                ]     ;A
381    movx                     m1, [src1q               ]     ;B
382    punpcklbw                m0, m1                         ;A B
383    movx                     m2, [srcq + sstrideq * 2 ]     ;C
384    pmaddubsw                m0, k0k1
385    mova                     m6, m2
386    movx                     m3, [src1q + sstrideq * 2]     ;D
387    punpcklbw                m2, m3                         ;C D
388    pmaddubsw                m2, k2k3
389    movx                     m4, [srcq + sstrideq * 4 ]     ;E
390    mova                     m7, m4
391    movx                     m5, [src1q + sstrideq * 4]     ;F
392    punpcklbw                m4, m5                         ;E F
393    pmaddubsw                m4, k4k5
394    punpcklbw                m1, m6                         ;A B next iter
395    movx                     m6, [srcq + sstride6q    ]     ;G
396    punpcklbw                m5, m6                         ;E F next iter
397    punpcklbw                m3, m7                         ;C D next iter
398    pmaddubsw                m5, k4k5
399    movx                     m7, [src1q + sstride6q   ]     ;H
400    punpcklbw                m6, m7                         ;G H
401    pmaddubsw                m6, k6k7
402    pmaddubsw                m3, k2k3
403    pmaddubsw                m1, k0k1
404    paddsw                   m0, m4
405    paddsw                   m2, m6
406    movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter
407    punpcklbw                m7, m6
408    pmaddubsw                m7, k6k7
409    paddsw                   m0, m2
410    paddsw                   m0, krd
411    psraw                    m0, 7
412    paddsw                   m1, m5
413    packuswb                 m0, m0
414
415    paddsw                   m3, m7
416    paddsw                   m1, m3
417    paddsw                   m1, krd
418    psraw                    m1, 7
419    lea                    srcq, [srcq + sstrideq * 2 ]
420    lea                   src1q, [src1q + sstrideq * 2]
421    packuswb                 m1, m1
422
423%ifidn %1, v8_avg
424    movx                     m2, [dstq]
425    pavgb                    m0, m2
426%endif
427    movx                 [dstq], m0
428    add                    dstq, dst_stride
429%ifidn %1, v8_avg
430    movx                     m3, [dstq]
431    pavgb                    m1, m3
432%endif
433    movx                 [dstq], m1
434    add                    dstq, dst_stride
435    sub                 heightd, 2
436    jg                    .loop
437
438    ; Do last row if output_height is odd
439    jne                   .done
440
441    movx                     m0, [srcq                ]     ;A
442    movx                     m1, [srcq + sstrideq     ]     ;B
443    movx                     m6, [srcq + sstride6q    ]     ;G
444    punpcklbw                m0, m1                         ;A B
445    movx                     m7, [src1q + sstride6q   ]     ;H
446    pmaddubsw                m0, k0k1
447    movx                     m2, [srcq + sstrideq * 2 ]     ;C
448    punpcklbw                m6, m7                         ;G H
449    movx                     m3, [src1q + sstrideq * 2]     ;D
450    pmaddubsw                m6, k6k7
451    movx                     m4, [srcq + sstrideq * 4 ]     ;E
452    punpcklbw                m2, m3                         ;C D
453    movx                     m5, [src1q + sstrideq * 4]     ;F
454    punpcklbw                m4, m5                         ;E F
455    pmaddubsw                m2, k2k3
456    pmaddubsw                m4, k4k5
457    paddsw                   m2, m6
458    paddsw                   m0, m4
459    paddsw                   m0, m2
460    paddsw                   m0, krd
461    psraw                    m0, 7
462    packuswb                 m0, m0
463%ifidn %1, v8_avg
464    movx                     m1, [dstq]
465    pavgb                    m0, m1
466%endif
467    movx                 [dstq], m0
468
469%else
470    ; ARCH_X86_64
    ; Only assembled when both ARCH_X86 and
    ; X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON are zero.
471
    ; Prime the register window with rows A..G before entering the loop.
472    movx                     m0, [srcq                ]     ;A
473    movx                     m1, [srcq + sstrideq     ]     ;B
474    lea                    srcq, [srcq + sstrideq * 2 ]
475    movx                     m2, [srcq]                     ;C
476    movx                     m3, [srcq + sstrideq]          ;D
477    lea                    srcq, [srcq + sstrideq * 2 ]
478    movx                     m4, [srcq]                     ;E
479    movx                     m5, [srcq + sstrideq]          ;F
480    lea                    srcq, [srcq + sstrideq * 2 ]
481    movx                     m6, [srcq]                     ;G
482    punpcklbw                m0, m1                         ;A B
483    punpcklbw                m1, m2                         ;A B next iter
484    punpcklbw                m2, m3                         ;C D
485    punpcklbw                m3, m4                         ;C D next iter
486    punpcklbw                m4, m5                         ;E F
487    punpcklbw                m5, m6                         ;E F next iter
488
489.loop:
490    ;Do two rows at once
    ; m0/m2/m4/m6 hold the pairs for the first output row, m1/m3/m5/m7 those
    ; for the second. The mova copies slide each pair down by two rows for
    ; the following iteration; m14 carries the newest row over as the next G.
491    movx                     m7, [srcq + sstrideq]          ;H
492    lea                    srcq, [srcq + sstrideq * 2 ]
493    movx                    m14, [srcq]                     ;H next iter
494    punpcklbw                m6, m7                         ;G H
495    punpcklbw                m7, m14                        ;G H next iter
496    pmaddubsw                m8, m0, k0k1
497    pmaddubsw                m9, m1, k0k1
498    mova                     m0, m2
499    mova                     m1, m3
500    pmaddubsw               m10, m2, k2k3
501    pmaddubsw               m11, m3, k2k3
502    mova                     m2, m4
503    mova                     m3, m5
504    pmaddubsw                m4, k4k5
505    pmaddubsw                m5, k4k5
506    paddsw                   m8, m4
507    paddsw                   m9, m5
508    mova                     m4, m6
509    mova                     m5, m7
510    pmaddubsw                m6, k6k7
511    pmaddubsw                m7, k6k7
512    paddsw                  m10, m6
513    paddsw                  m11, m7
514    paddsw                   m8, m10
515    paddsw                   m9, m11
516    mova                     m6, m14
517    paddsw                   m8, krd
518    paddsw                   m9, krd
519    psraw                    m8, 7
520    psraw                    m9, 7
    ; Width 4 keeps the two rows in separate registers; width 8 packs them
    ; into the low and high halves of m8.
521%ifidn %2, 4
522    packuswb                 m8, m8
523    packuswb                 m9, m9
524%else
525    packuswb                 m8, m9
526%endif
527
528%ifidn %1, v8_avg
529    movx                     m7, [dstq]
530%ifidn %2, 4
531    movx                    m10, [dstq + dstrideq]
532    pavgb                    m9, m10
533%else
534    movhpd                   m7, [dstq + dstrideq]
535%endif
536    pavgb                    m8, m7
537%endif
538    movx                 [dstq], m8
539%ifidn %2, 4
540    movx      [dstq + dstrideq], m9
541%else
542    movhpd    [dstq + dstrideq], m8
543%endif
544
545    lea                    dstq, [dstq + dstrideq * 2 ]
546    sub                 heightd, 2
547    jg                    .loop
548
549    ; Do last row if output_height is odd
550    jne                   .done
551
    ; m0, m2 and m4 already hold the A B, C D and E F pairs for this row and
    ; m6 holds G; only H has to be loaded.
552    movx                     m7, [srcq + sstrideq]          ;H
553    punpcklbw                m6, m7                         ;G H
554    pmaddubsw                m0, k0k1
555    pmaddubsw                m2, k2k3
556    pmaddubsw                m4, k4k5
557    pmaddubsw                m6, k6k7
558    paddsw                   m0, m4
559    paddsw                   m2, m6
560    paddsw                   m0, m2
561    paddsw                   m0, krd
562    psraw                    m0, 7
563    packuswb                 m0, m0
564%ifidn %1, v8_avg
565    movx                     m1, [dstq]
566    pavgb                    m0, m1
567%endif
568    movx                 [dstq], m0
569
570%endif ; ARCH_X86_64
571
572.done:
573    REP_RET
574
575%endm
576
577;-------------------------------------------------------------------------------
; SUBPIX_VFILTER16 <variant>
; Emits filter_block1d16_<variant>(src, sstride, dst, dstride, height,
; filter): an 8-tap filter applied down each column that writes 16 bytes per
; row.
; <variant> is instantiated below as v8 and v8_avg; for v8_avg the filtered
; bytes are averaged (pavgb) with the bytes already at dst before the store.
; The eight source rows feeding one output row are labelled A..H, A being the
; row at src.
; Two implementations are selected at assembly time:
;   * ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON: one output row per
;     iteration; the left 8 columns and the right 8 columns (src + 8) are
;     each loaded and filtered separately, then packed into one register.
;   * otherwise: two output rows per iteration with interleaved row pairs
;     kept in registers (plus the tmp0/tmp1 stack slots) between iterations,
;     followed by a single-row tail when height is odd.
; NOTE(review): dst is accessed with mova and as a pavgb memory operand, so it
; presumably must be 16-byte aligned -- confirm.
578%macro SUBPIX_VFILTER16 1
579cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
580                             src, sstride, dst, dstride, height, filter
581    mova                     m4, [filterq]
582    SETUP_LOCAL_VARS
583
584%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
585
586%if ARCH_X86_64
587    %define               src1q  r7
588    %define           sstride6q  r8
589    %define          dst_stride  dstrideq
590%else
    ; No spare registers here: src1q takes over filterq (the taps were already
    ; read from it above), sstride6q takes over dstrideq, and the dst stride
    ; is used through dstridemp instead.
591    %define               src1q  filterq
592    %define           sstride6q  dstrideq
593    %define          dst_stride  dstridemp
594%endif
595    lea                   src1q, [srcq + sstrideq]
596    lea               sstride6q, [sstrideq + sstrideq * 4]
597    add               sstride6q, sstrideq                   ;pitch * 6
598
599.loop:
    ; Left 8 columns: result accumulates in m0.
600    movh                     m0, [srcq                ]     ;A
601    movh                     m1, [src1q               ]     ;B
602    movh                     m2, [srcq + sstrideq * 2 ]     ;C
603    movh                     m3, [src1q + sstrideq * 2]     ;D
604    movh                     m4, [srcq + sstrideq * 4 ]     ;E
605    movh                     m5, [src1q + sstrideq * 4]     ;F
606
607    punpcklbw                m0, m1                         ;A B
608    movh                     m6, [srcq + sstride6q]         ;G
609    punpcklbw                m2, m3                         ;C D
610    movh                     m7, [src1q + sstride6q]        ;H
611    punpcklbw                m4, m5                         ;E F
612    pmaddubsw                m0, k0k1
    ; Right 8 columns (loads at + 8) start here, interleaved with the left
    ; half; their result accumulates in m3.
613    movh                     m3, [srcq + 8]                 ;A
614    pmaddubsw                m2, k2k3
615    punpcklbw                m6, m7                         ;G H
616    movh                     m5, [srcq + sstrideq + 8]      ;B
617    pmaddubsw                m4, k4k5
618    punpcklbw                m3, m5                         ;A B
619    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
620    pmaddubsw                m6, k6k7
621    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
622    punpcklbw                m7, m5                         ;C D
623    paddsw                   m2, m6
624    pmaddubsw                m3, k0k1
625    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
626    paddsw                   m0, m4
627    pmaddubsw                m7, k2k3
628    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
629    punpcklbw                m1, m6                         ;E F
630    paddsw                   m0, m2
631    paddsw                   m0, krd
632    movh                     m2, [srcq + sstride6q + 8]     ;G
633    pmaddubsw                m1, k4k5
634    movh                     m5, [src1q + sstride6q + 8]    ;H
635    psraw                    m0, 7
636    punpcklbw                m2, m5                         ;G H
637    pmaddubsw                m2, k6k7
638    paddsw                   m7, m2
639    paddsw                   m3, m1
640    paddsw                   m3, m7
641    paddsw                   m3, krd
642    psraw                    m3, 7
    ; Low 8 bytes = left columns, high 8 bytes = right columns.
643    packuswb                 m0, m3
644
645    add                    srcq, sstrideq
646    add                   src1q, sstrideq
647%ifidn %1, v8_avg
648    pavgb                    m0, [dstq]
649%endif
650    mova                 [dstq], m0
651    add                    dstq, dst_stride
652    dec                 heightd
653    jnz                   .loop
654    REP_RET
655
656%else
657    ; ARCH_X86_64
    ; Only assembled when both ARCH_X86 and
    ; X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON are zero.
    ; Bias height by one so that, after the "sub heightd, 2" at the bottom of
    ; the loop, >0 means another row pair remains and ==0 means one odd row.
658    dec                 heightd
659
    ; Prime the window. Pairs for the first output row of each iteration:
    ;   m0/m1 = A B (low/high 8 columns), m4/m5 = C D, m8/m9 = E F.
    ; Pairs for the second output row:
    ;   tmp0/tmp1 = B C, m6/m7 = D E, m10/m11 = F G.
    ; m2 holds row G.
660    movu                     m1, [srcq                ]     ;A
661    movu                     m3, [srcq + sstrideq     ]     ;B
662    lea                    srcq, [srcq + sstrideq * 2]
663    punpcklbw                m0, m1, m3                     ;A B
664    punpckhbw                m1, m3                         ;A B
665    movu                     m5, [srcq]                     ;C
666    punpcklbw                m2, m3, m5                     ;A B next iter
667    punpckhbw                m3, m5                         ;A B next iter
668    mova                   tmp0, m2                         ;store to stack
669    mova                   tmp1, m3                         ;store to stack
670    movu                     m7, [srcq + sstrideq]          ;D
671    lea                    srcq, [srcq + sstrideq * 2]
672    punpcklbw                m4, m5, m7                     ;C D
673    punpckhbw                m5, m7                         ;C D
674    movu                     m9, [srcq]                     ;E
675    punpcklbw                m6, m7, m9                     ;C D next iter
676    punpckhbw                m7, m9                         ;C D next iter
677    movu                    m11, [srcq + sstrideq]          ;F
678    lea                    srcq, [srcq + sstrideq * 2]
679    punpcklbw                m8, m9, m11                    ;E F
680    punpckhbw                m9, m11                        ;E F
681    movu                     m2, [srcq]                     ;G
682    punpcklbw               m10, m11, m2                    ;E F next iter
683    punpckhbw               m11, m2                         ;E F next iter
684
685.loop:
686    ;Do two rows at once
    ; First output row, low 8 columns -> m13. The mova copies slide each pair
    ; down by two rows for the following iteration.
687    pmaddubsw               m13, m0, k0k1
688    mova                     m0, m4
689    pmaddubsw               m14, m8, k4k5
690    pmaddubsw               m15, m4, k2k3
691    mova                     m4, m8
692    paddsw                  m13, m14
693    movu                     m3, [srcq + sstrideq]          ;H
694    lea                    srcq, [srcq + sstrideq * 2]
695    punpcklbw               m14, m2, m3                     ;G H
696    mova                     m8, m14
697    pmaddubsw               m14, k6k7
698    paddsw                  m15, m14
699    paddsw                  m13, m15
700    paddsw                  m13, krd
701    psraw                   m13, 7
702
    ; First output row, high 8 columns -> m14.
703    pmaddubsw               m14, m1, k0k1
704    pmaddubsw                m1, m9, k4k5
705    pmaddubsw               m15, m5, k2k3
706    paddsw                  m14, m1
707    mova                     m1, m5
708    mova                     m5, m9
709    punpckhbw                m2, m3                         ;G H
710    mova                     m9, m2
711    pmaddubsw                m2, k6k7
712    paddsw                  m15, m2
713    paddsw                  m14, m15
714    paddsw                  m14, krd
715    psraw                   m14, 7
716    packuswb                m13, m14
717%ifidn %1, v8_avg
718    pavgb                   m13, [dstq]
719%endif
720    mova                 [dstq], m13
721
722    ; next iter
    ; Second output row, low 8 columns -> m15.
723    pmaddubsw               m15, tmp0, k0k1
724    pmaddubsw               m14, m10, k4k5
725    pmaddubsw               m13, m6, k2k3
726    paddsw                  m15, m14
727    mova                   tmp0, m6
728    mova                     m6, m10
729    movu                     m2, [srcq]                     ;G next iter
730    punpcklbw               m14, m3, m2                     ;G H next iter
731    mova                    m10, m14
732    pmaddubsw               m14, k6k7
733    paddsw                  m13, m14
734    paddsw                  m15, m13
735    paddsw                  m15, krd
736    psraw                   m15, 7
737
    ; Second output row, high 8 columns -> m14.
738    pmaddubsw               m14, tmp1, k0k1
739    mova                   tmp1, m7
740    pmaddubsw               m13, m7, k2k3
741    mova                     m7, m11
742    pmaddubsw               m11, k4k5
743    paddsw                  m14, m11
744    punpckhbw                m3, m2                         ;G H next iter
745    mova                    m11, m3
746    pmaddubsw                m3, k6k7
747    paddsw                  m13, m3
748    paddsw                  m14, m13
749    paddsw                  m14, krd
750    psraw                   m14, 7
751    packuswb                m15, m14
752%ifidn %1, v8_avg
753    pavgb                   m15, [dstq + dstrideq]
754%endif
755    mova      [dstq + dstrideq], m15
756    lea                    dstq, [dstq + dstrideq * 2]
757    sub                 heightd, 2
758    jg                    .loop
759
760    ; Do last row if output_height is odd
761    jne                   .done
762
    ; The A B, C D and E F pairs for this row are already in m0/m1, m4/m5 and
    ; m8/m9, and m2 holds G; only H has to be loaded.
763    movu                     m3, [srcq + sstrideq]          ;H
764    punpcklbw                m6, m2, m3                     ;G H
765    punpckhbw                m2, m3                         ;G H
766    pmaddubsw                m0, k0k1
767    pmaddubsw                m1, k0k1
768    pmaddubsw                m4, k2k3
769    pmaddubsw                m5, k2k3
770    pmaddubsw                m8, k4k5
771    pmaddubsw                m9, k4k5
772    pmaddubsw                m6, k6k7
773    pmaddubsw                m2, k6k7
774    paddsw                   m0, m8
775    paddsw                   m1, m9
776    paddsw                   m4, m6
777    paddsw                   m5, m2
778    paddsw                   m0, m4
779    paddsw                   m1, m5
780    paddsw                   m0, krd
781    paddsw                   m1, krd
782    psraw                    m0, 7
783    psraw                    m1, 7
784    packuswb                 m0, m1
785%ifidn %1, v8_avg
786    pavgb                    m0, [dstq]
787%endif
788    mova                 [dstq], m0
789
790.done:
791    REP_RET
792
793%endif ; ARCH_X86_64
794
795%endm
796
; Instantiate the vertical filters: for each of the widths 16, 8 and 4, a
; plain (v8) and a dst-averaging (v8_avg) version.
797INIT_XMM ssse3
798SUBPIX_VFILTER16     v8
799SUBPIX_VFILTER16 v8_avg
800SUBPIX_VFILTER       v8, 8
801SUBPIX_VFILTER   v8_avg, 8
802SUBPIX_VFILTER       v8, 4
803SUBPIX_VFILTER   v8_avg, 4
804