1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
SECTION_RODATA
; Byte constants.
pb_1:    times 16 db 1      ; bit-0 mask used by X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2
dc_128:  times 16 db 128    ; fill value written by the dc_128 predictors

; Rounding terms for the two-edge DC predictors: pw_N is added to the sum of
; the N above and N left pixels before the shift by log2(2*N).
pw_4:    times 8 dw 4
pw_8:    times 8 dw 8
pw_16:   times 8 dw 16
pw_32:   times 8 dw 32

; Rounding terms for the one-edge (dc_top / dc_left) predictors: pw2_N holds
; N/2 and is added to the sum of N pixels before the shift by log2(N).
pw2_4:   times 8 dw 2
pw2_8:   times 8 dw 4
pw2_16:  times 8 dw 8
pw2_32:  times 8 dw 16
24
25SECTION .text
26
; ------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
;
; Per byte: result = (x + 2*y + z + 2) >> 2, computed without widening.
;
; trick from pascal:
;   t      = avg(x, z)        ; pavgb rounds up: (x + z + 1) >> 1
;   t     -= (x ^ z) & 1      ; remove the round-up -> (x + z) >> 1
;   result = avg(t, y)
;
; x and y are left untouched; z is overwritten with (x ^ z) & 1.
; pb_1 is read through GLOBAL(); every user in this file expands the macro
; after GET_GOT.
; ------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
  mova                %4, %1
  pavgb               %4, %3                 ; avg(x, z), rounded up
  pxor                %3, %1
  pand                %3, [GLOBAL(pb_1)]     ; 1 where x + z is odd
  psubb               %4, %3                 ; floor((x + z) / 2)
  pavgb               %4, %2                 ; (x + 2y + z + 2) >> 2
%endmacro
43
; d45_predictor_4x4(dst, stride, above)
; Row r receives bytes r..r+3 of the 3-tap filtered above row; the bottom-right
; pixel is then replaced by above[7].
INIT_XMM sse2
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                 m0, [aboveq]      ; above[0..7]
  ; above is not needed as a pointer any more: reuse its register as scratch
  DEFINE_ARGS dst, stride, edge
  psrldq               m2, m0, 2         ; above[i+2]
  psrldq               m1, m0, 1         ; above[i+1]
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3

  ; edge = above[7] (m0 is preserved by the macro)
  psrlq                m0, 56
  movd              edgeq, m0

  ; store 4 lines, sliding the filtered row by one byte per line
  movd   [dstq          ], m3
  psrlq                m3, 8
  movd   [dstq+strideq  ], m3
  psrlq                m3, 8
  lea                dstq, [dstq+strideq*2]
  movd   [dstq          ], m3
  psrlq                m3, 8
  movd   [dstq+strideq  ], m3
  mov    [dstq+strideq+3], edgeb         ; last pixel of the last line

  RESTORE_GOT
  RET
69
; d45_predictor_8x8(dst, stride, above)
; Builds a 16-byte vector: filtered above pixels in the low half, above[7]
; replicated in the high half. Row r stores 8 bytes starting at byte r+1.
INIT_XMM sse2
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movu                m1, [aboveq]       ; 16 bytes starting at above[0]
  DEFINE_ARGS dst, stride, stridex3
  lea          stridex3q, [strideq*3]
  psrldq              m2, m1, 1          ; right neighbour: above[i+1]
  pslldq              m0, m1, 1          ; left neighbour: above[i-1], byte 0 = 0
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
  ; bytes 8..15 of m0 are above[7..14]; spread above[7] over the low qword
  punpckhbw           m0, m0             ; 7 7
  punpcklwd           m0, m0             ; 7 7 7 7
  punpckldq           m0, m0             ; 7 7 7 7 7 7 7 7
  punpcklqdq          m3, m0             ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7

  ; lines 0..3 (the first shift drops the "-1" byte)
  psrldq                m3, 1
  movq    [dstq           ], m3
  psrldq                m3, 1
  movq    [dstq+strideq   ], m3
  psrldq                m3, 1
  movq    [dstq+strideq*2 ], m3
  psrldq                m3, 1
  movq    [dstq+stridex3q ], m3
  lea                 dstq, [dstq+strideq*4]

  ; lines 4..7
  psrldq                m3, 1
  movq    [dstq           ], m3
  psrldq                m3, 1
  movq    [dstq+strideq   ], m3
  psrldq                m3, 1
  movq    [dstq+strideq*2 ], m3
  psrldq                m3, 1
  movq    [dstq+stridex3q ], m3

  RESTORE_GOT
  RET
108
; d207_predictor_4x4(dst, stride, unused, left)
; With left = a b c d, interleaves the 2-tap and 3-tap averages of the left
; column; each line starts two bytes further in, and the last line is all d.
INIT_XMM sse2
cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
  GET_GOT     goffsetq

  movd                m0, [leftq]                ; abcd [byte]
  ; m4 = d replicated four times
  punpcklbw           m4, m0, m0                 ; aabb ccdd
  punpcklwd           m4, m4                     ; aaaa bbbb cccc dddd
  psrldq              m4, 12                     ; dddd
  punpckldq           m0, m4                     ; abcd dddd
  psrldq              m2, m0, 2                  ; cddd
  psrldq              m1, m0, 1                  ; bcdd

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3   ; a2bc b2cd c3d d
  pavgb               m1, m0                     ; ab, bc, cd, d [byte]
  punpcklbw           m1, m3             ; ab, a2bc, bc, b2cd, cd, c3d, d, d

  ; lines 0 and 1
  movd    [dstq        ], m1
  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
  movd    [dstq+strideq], m1

  ; lines 2 and 3
  psrlq               m1, 16             ; cd, c3d, d, d
  lea               dstq, [dstq+strideq*2]
  movd    [dstq        ], m1
  movd    [dstq+strideq], m4             ; d, d, d, d

  RESTORE_GOT
  RET
135
; dc_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m0, [aboveq]
  movd                  m2, [leftq]
  ; both edge pointers are consumed; reuse a register for stride*3
  DEFINE_ARGS dst, stride, stridex3
  lea            stridex3q, [strideq*3]
  pxor                  m1, m1
  punpckldq             m0, m2           ; above[0..3] left[0..3]
  psadbw                m0, m1           ; sum of the 8 edge bytes
  paddw                 m0, [GLOBAL(pw_4)]
  psraw                 m0, 3
  pshuflw               m0, m0, 0x0      ; DC in 4 words
  packuswb              m0, m0           ; DC in the low bytes
  movd    [dstq           ], m0
  movd    [dstq+strideq   ], m0
  movd    [dstq+strideq*2 ], m0
  movd    [dstq+stridex3q ], m0

  RESTORE_GOT
  RET
157
; dc_left_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block with (sum(left[0..3]) + 2) >> 2; above is not read.
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
  ; Only two arguments are auto-loaded, so fetch left by hand. This is kept
  ; ahead of GET_GOT as in the original.
  ; NOTE(review): presumably because GET_GOT may move the stack pointer that
  ; leftmp is relative to -- confirm against x86inc.asm.
  movifnidn          leftq, leftmp
  GET_GOT     goffsetq

  movd                  m0, [leftq]
  DEFINE_ARGS dst, stride, stridex3
  lea            stridex3q, [strideq*3]
  pxor                  m1, m1
  psadbw                m0, m1           ; sum of the 4 left bytes
  paddw                 m0, [GLOBAL(pw2_4)]
  psraw                 m0, 2
  pshuflw               m0, m0, 0x0
  packuswb              m0, m0
  movd    [dstq           ], m0
  movd    [dstq+strideq   ], m0
  movd    [dstq+strideq*2 ], m0
  movd    [dstq+stridex3q ], m0

  RESTORE_GOT
  RET
178
; dc_top_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block with (sum(above[0..3]) + 2) >> 2; left is not read.
INIT_XMM sse2
cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stridex3
  lea            stridex3q, [strideq*3]
  pxor                  m1, m1
  psadbw                m0, m1           ; sum of the 4 above bytes
  paddw                 m0, [GLOBAL(pw2_4)]
  psraw                 m0, 2
  pshuflw               m0, m0, 0x0
  packuswb              m0, m0
  movd    [dstq           ], m0
  movd    [dstq+strideq   ], m0
  movd    [dstq+strideq*2 ], m0
  movd    [dstq+stridex3q ], m0

  RESTORE_GOT
  RET
198
; dc_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
INIT_XMM sse2
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [aboveq]
  movq                  m2, [leftq]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3
  psadbw                m0, m1           ; sum of above
  psadbw                m2, m1           ; sum of left
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw_8)]
  psraw                 m0, 4
  punpcklbw             m0, m0           ; DC duplicated into both bytes of word 0
  pshuflw               m0, m0, 0x0      ; DC in the low 8 bytes
  lea            stridex3q, [strideq*3]

  ; lines 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  ; lines 4..7
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0

  RESTORE_GOT
  RET
227
; dc_top_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block with (sum(above[0..7]) + 4) >> 3; left is not read.
INIT_XMM sse2
cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [aboveq]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3
  psadbw                m0, m1           ; sum of the 8 above bytes
  paddw                 m0, [GLOBAL(pw2_8)]
  psraw                 m0, 3
  punpcklbw             m0, m0
  pshuflw               m0, m0, 0x0      ; DC in the low 8 bytes
  lea            stridex3q, [strideq*3]

  ; lines 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  ; lines 4..7
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0

  RESTORE_GOT
  RET
253
; dc_left_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block with (sum(left[0..7]) + 4) >> 3; above is not read.
INIT_XMM sse2
cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
  ; left is not auto-loaded (only 2 args are); fetch it before GET_GOT as the
  ; original does.
  movifnidn          leftq, leftmp
  GET_GOT     goffsetq

  movq                  m0, [leftq]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3
  psadbw                m0, m1           ; sum of the 8 left bytes
  paddw                 m0, [GLOBAL(pw2_8)]
  psraw                 m0, 3
  punpcklbw             m0, m0
  pshuflw               m0, m0, 0x0      ; DC in the low 8 bytes
  lea            stridex3q, [strideq*3]

  ; lines 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  ; lines 4..7
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0

  RESTORE_GOT
  RET
280
; dc_128_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block with the constant 128; above and left are not read.
INIT_XMM sse2
cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m0, [GLOBAL(dc_128)]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0

  RESTORE_GOT
  RET
294
; dc_128_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block with the constant 128; above and left are not read.
INIT_XMM sse2
cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [GLOBAL(dc_128)]
  DEFINE_ARGS dst, stride, stridex3
  lea            stridex3q, [strideq*3]

  ; lines 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  ; lines 4..7
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0

  RESTORE_GOT
  RET
313
; dc_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
; Uses aligned 16-byte loads from above/left and aligned stores to dst.
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  ; read both edges before their registers are reassigned
  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3, cnt
  psadbw                m0, m1           ; one partial sum per qword
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0           ; bring the high partial sum down
  paddw                 m0, m2           ; word 0 = sum of all 32 edge bytes
  paddw                 m0, [GLOBAL(pw_16)]
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0           ; DC in all 16 bytes
  lea            stridex3q, [strideq*3]
  mov                 cntd, 4            ; 4 passes of 4 lines
.loop:
  mova    [dstq           ], m0
  mova    [dstq+strideq   ], m0
  mova    [dstq+strideq*2 ], m0
  mova    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop

  RESTORE_GOT
  REP_RET
345
346
; dc_top_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block with (sum(above[0..15]) + 8) >> 4; left is not read.
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3, cnt
  psadbw                m0, m1           ; one partial sum per qword
  movhlps               m2, m0           ; only the low qword of m2 matters
  paddw                 m0, m2           ; word 0 = sum of the 16 above bytes
  paddw                 m0, [GLOBAL(pw2_16)]
  psraw                 m0, 4
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0           ; DC in all 16 bytes
  lea            stridex3q, [strideq*3]
  mov                 cntd, 4            ; 4 passes of 4 lines
.loop:
  mova    [dstq           ], m0
  mova    [dstq+strideq   ], m0
  mova    [dstq+strideq*2 ], m0
  mova    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop

  RESTORE_GOT
  REP_RET
375
; dc_left_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block with (sum(left[0..15]) + 8) >> 4; above is not read.
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [leftq]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3, cnt
  psadbw                m0, m1           ; one partial sum per qword
  movhlps               m2, m0           ; only the low qword of m2 matters
  paddw                 m0, m2           ; word 0 = sum of the 16 left bytes
  paddw                 m0, [GLOBAL(pw2_16)]
  psraw                 m0, 4
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0           ; DC in all 16 bytes
  lea            stridex3q, [strideq*3]
  mov                 cntd, 4            ; 4 passes of 4 lines
.loop:
  mova    [dstq           ], m0
  mova    [dstq+strideq   ], m0
  mova    [dstq+strideq*2 ], m0
  mova    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop

  RESTORE_GOT
  REP_RET
404
; dc_128_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block at dst with the constant 128 using aligned 16-byte
; stores. above and left are never read, so only dst and stride are loaded and
; a single xmm register is declared, matching the 4x4 and 8x8 variants.
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4            ; 4 passes of 4 lines
  mova                  m0, [GLOBAL(dc_128)]
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  ; The return can directly follow the loop branch when RESTORE_GOT expands
  ; to nothing, so use REP_RET like the other looped predictors in this file.
  REP_RET
423
424
; dc_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  ; read both 32-byte edges before their registers are reassigned
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [leftq]
  mova                  m4, [leftq+16]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3, cnt
  psadbw                m0, m1
  psadbw                m2, m1
  psadbw                m3, m1
  psadbw                m4, m1
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4           ; per-qword partial sums of all 64 bytes
  movhlps               m2, m0
  paddw                 m0, m2           ; word 0 = total
  paddw                 m0, [GLOBAL(pw_32)]
  psraw                 m0, 6
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0           ; DC in all 16 bytes
  lea            stridex3q, [strideq*3]
  mov                 cntd, 8            ; 8 passes of 4 lines
.loop:
  mova [dstq              ], m0
  mova [dstq           +16], m0
  mova [dstq+strideq      ], m0
  mova [dstq+strideq   +16], m0
  mova [dstq+strideq*2    ], m0
  mova [dstq+strideq*2 +16], m0
  mova [dstq+stridex3q    ], m0
  mova [dstq+stridex3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop

  RESTORE_GOT
  REP_RET
466
; dc_top_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block with (sum(above[0..31]) + 16) >> 5; left is not read.
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3, cnt
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2           ; per-qword partial sums
  movhlps               m2, m0
  paddw                 m0, m2           ; word 0 = sum of the 32 above bytes
  paddw                 m0, [GLOBAL(pw2_32)]
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0           ; DC in all 16 bytes
  lea            stridex3q, [strideq*3]
  mov                 cntd, 8            ; 8 passes of 4 lines
.loop:
  mova [dstq              ], m0
  mova [dstq           +16], m0
  mova [dstq+strideq      ], m0
  mova [dstq+strideq   +16], m0
  mova [dstq+strideq*2    ], m0
  mova [dstq+strideq*2 +16], m0
  mova [dstq+stridex3q    ], m0
  mova [dstq+stridex3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop

  RESTORE_GOT
  REP_RET
502
; dc_left_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block with (sum(left[0..31]) + 16) >> 5; above is not read.
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [leftq]
  mova                  m2, [leftq+16]
  pxor                  m1, m1
  DEFINE_ARGS dst, stride, stridex3, cnt
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2           ; per-qword partial sums
  movhlps               m2, m0
  paddw                 m0, m2           ; word 0 = sum of the 32 left bytes
  paddw                 m0, [GLOBAL(pw2_32)]
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0           ; DC in all 16 bytes
  lea            stridex3q, [strideq*3]
  mov                 cntd, 8            ; 8 passes of 4 lines
.loop:
  mova [dstq              ], m0
  mova [dstq           +16], m0
  mova [dstq+strideq      ], m0
  mova [dstq+strideq   +16], m0
  mova [dstq+strideq*2    ], m0
  mova [dstq+strideq*2 +16], m0
  mova [dstq+stridex3q    ], m0
  mova [dstq+stridex3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop

  RESTORE_GOT
  REP_RET
538
; dc_128_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block at dst with the constant 128 using aligned 16-byte
; stores. above and left are never read, so only dst and stride are loaded and
; a single xmm register is declared, matching the 4x4 and 8x8 variants.
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8            ; 8 passes of 4 lines
  mova                  m0, [GLOBAL(dc_128)]
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  ; The return can directly follow the loop branch when RESTORE_GOT expands
  ; to nothing, so use REP_RET like the other looped predictors in this file.
  REP_RET
561
; v_predictor_4x4(dst, stride, above)
; Copies above[0..3] into each of the 4 lines of the block.
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
  movd                  m0, [aboveq]
  ; above has been read; reuse its register for stride*3
  DEFINE_ARGS dst, stride, stridex3
  lea            stridex3q, [strideq*3]
  movd    [dstq           ], m0
  movd    [dstq+strideq   ], m0
  movd    [dstq+strideq*2 ], m0
  movd    [dstq+stridex3q ], m0
  RET
571
; v_predictor_8x8(dst, stride, above)
; Copies above[0..7] into each of the 8 lines of the block.
INIT_XMM sse2
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  ; above has been read; reuse its register for stride*3
  DEFINE_ARGS dst, stride, stridex3
  lea            stridex3q, [strideq*3]

  ; lines 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  ; lines 4..7
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stridex3q ], m0
  RET
587
; v_predictor_16x16(dst, stride, above)
; Copies above[0..15] into each of the 16 lines (aligned load and stores).
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stridex3, cnt
  mov                 cntd, 4            ; 4 passes of 4 lines
  lea            stridex3q, [strideq*3]
.loop:
  mova    [dstq           ], m0
  mova    [dstq+strideq   ], m0
  mova    [dstq+strideq*2 ], m0
  mova    [dstq+stridex3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop
  REP_RET
603
; v_predictor_32x32(dst, stride, above)
; Copies above[0..31] into each of the 32 lines (aligned loads and stores).
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]     ; left half of the row
  mova                  m1, [aboveq+16]  ; right half of the row
  DEFINE_ARGS dst, stride, stridex3, cnt
  mov                 cntd, 8            ; 8 passes of 4 lines
  lea            stridex3q, [strideq*3]
.loop:
  mova [dstq              ], m0
  mova [dstq           +16], m1
  mova [dstq+strideq      ], m0
  mova [dstq+strideq   +16], m1
  mova [dstq+strideq*2    ], m0
  mova [dstq+strideq*2 +16], m1
  mova [dstq+stridex3q    ], m0
  mova [dstq+stridex3q +16], m1
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .loop
  REP_RET
624
; h_predictor_4x4(dst, stride, <unused>, left)
; Line r of the block is left[r] repeated 4 times.
INIT_XMM sse2
cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
  movifnidn          leftq, leftmp       ; only 2 args are auto-loaded
  movd                  m0, [leftq]
  DEFINE_ARGS dst, stride, stridex3, left
  lea            stridex3q, [strideq*3]
  punpcklbw             m0, m0
  punpcklbw             m0, m0           ; dword r = left[r] x4
  pshufd                m1, m0, 0x1      ; low dword = left[1] x4
  pshufd                m2, m0, 0x2      ; low dword = left[2] x4
  pshufd                m3, m0, 0x3      ; low dword = left[3] x4
  movd    [dstq           ], m0
  movd    [dstq+strideq   ], m1
  movd    [dstq+strideq*2 ], m2
  movd    [dstq+stridex3q ], m3
  RET
640
; h_predictor_8x8(dst, stride, <unused>, left)
; Line r of the block is left[r] repeated 8 times. Two passes of 4 lines.
INIT_XMM sse2
cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
  movifnidn          leftq, leftmp       ; only 2 args are auto-loaded
  DEFINE_ARGS  dst, stride, cnt, left, stridex3
  movq                  m0, [leftq    ]
  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
  lea            stridex3q, [strideq*3]
  mov                 cntq, -2              ; counts up to zero
.loop:
  pshuflw               m1, m0, 0x0         ; l1 x8 in the low qword
  pshuflw               m2, m0, 0x55        ; l2 x8 in the low qword
  movq    [dstq           ], m1
  movq    [dstq+strideq   ], m2
  pshuflw               m1, m0, 0xaa        ; l3 x8
  pshuflw               m2, m0, 0xff        ; l4 x8
  movq    [dstq+strideq*2 ], m1
  movq    [dstq+stridex3q ], m2
  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
  lea                 dstq, [dstq+strideq*4]
  inc                 cntq
  jnz .loop
  REP_RET
663
; h_predictor_16x16(dst, stride, <unused>, left)
; Line r of the block is left[r] repeated 16 times. Each pass consumes 4 left
; pixels and writes 4 lines with aligned stores.
INIT_XMM sse2
cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
  movifnidn          leftq, leftmp       ; only 2 args are auto-loaded
  DEFINE_ARGS dst, stride, cnt, left, stridex3
  lea            stridex3q, [strideq*3]
  mov                 cntq, -4              ; counts up to zero
.loop:
  movd                  m0, [leftq]
  punpcklbw             m0, m0
  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
  pshufd                m1, m0, 0x0         ; l1 repeated 16 times
  pshufd                m2, m0, 0x55        ; l2 repeated 16 times
  mova    [dstq           ], m1
  mova    [dstq+strideq   ], m2
  pshufd                m1, m0, 0xaa        ; l3 repeated 16 times
  pshufd                m2, m0, 0xff        ; l4 repeated 16 times
  mova    [dstq+strideq*2 ], m1
  mova    [dstq+stridex3q ], m2
  lea                leftq, [leftq+4       ]
  lea                 dstq, [dstq+strideq*4]
  inc                 cntq
  jnz .loop
  REP_RET
687
; h_predictor_32x32(dst, stride, <unused>, left)
; Line r of the block is left[r] repeated 32 times. Each pass consumes 4 left
; pixels and writes 4 lines as two aligned 16-byte stores per line.
INIT_XMM sse2
cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
  movifnidn              leftq, leftmp   ; only 2 args are auto-loaded
  DEFINE_ARGS dst, stride, cnt, left, stridex3
  lea                stridex3q, [strideq*3]
  mov                     cntq, -8              ; counts up to zero
.loop:
  movd                      m0, [leftq]
  punpcklbw                 m0, m0
  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
  mova     [dstq              ], m1
  mova     [dstq+16           ], m1
  mova     [dstq+strideq      ], m2
  mova     [dstq+strideq+16   ], m2
  pshufd                m1, m0, 0xaa            ; l3 repeated 16 times
  pshufd                m2, m0, 0xff            ; l4 repeated 16 times
  mova     [dstq+strideq*2    ], m1
  mova     [dstq+strideq*2+16 ], m1
  mova     [dstq+stridex3q    ], m2
  mova     [dstq+stridex3q+16 ], m2
  lea                    leftq, [leftq+4       ]
  lea                     dstq, [dstq+strideq*4]
  inc                     cntq
  jnz .loop
  REP_RET
715
; tm_predictor_4x4(dst, stride, above, left)
; pixel(r, c) = saturate_u8(left[r] + above[c] - above[-1]), where above[-1]
; (tl) is the byte just before the above row.
INIT_XMM sse2
cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
  pxor                  m1, m1
  movd                  m2, [leftq]
  movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
  punpcklbw             m2, m1        ; [63:0] l1 l2 l3 l4 [word]
  punpcklbw             m0, m1        ; tl t1 t2 t3 t4 x x x [word]
  pshuflw               m3, m0, 0x0   ; [63:0] tl tl tl tl [word]
  psrldq                m0, 2         ; drop tl: t1 t2 t3 t4 ...
  psubw                 m0, m3        ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]

  ; lines 0 and 1
  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
  paddw                 m4, m0
  paddw                 m3, m0
  packuswb              m4, m4        ; saturate to bytes
  packuswb              m3, m3
  movd      [dstq        ], m4
  movd      [dstq+strideq], m3

  ; lines 2 and 3
  pshuflw               m4, m2, 0xaa  ; [63:0] l3 l3 l3 l3 [word]
  pshuflw               m3, m2, 0xff  ; [63:0] l4 l4 l4 l4 [word]
  paddw                 m4, m0
  paddw                 m3, m0
  packuswb              m4, m4
  packuswb              m3, m3
  lea                 dstq, [dstq+strideq*2]
  movd      [dstq        ], m4
  movd      [dstq+strideq], m3
  RET
744
; tm_predictor_8x8(dst, stride, above, left)
; pixel(r, c) = saturate_u8(left[r] + above[c] - above[-1]).
; The loop runs 4 times and emits two lines per pass.
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
  pxor                  m1, m1
  movq                  m0, [aboveq]
  movd                  m2, [aboveq-1]
  punpcklbw             m0, m1        ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
  punpcklbw             m2, m1        ; word 0 = tl
  pshuflw               m2, m2, 0x0   ; [63:0] tl tl tl tl [word]
  punpcklqdq            m2, m2        ; tl tl tl tl tl tl tl tl [word]
  psubw                 m0, m2        ; t1-tl t2-tl ... t8-tl [word]
  ; above is fully consumed; its register becomes the loop counter
  DEFINE_ARGS dst, stride, cnt, left
  movq                  m2, [leftq]
  punpcklbw             m2, m1        ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
  mov                 cntq, -4        ; counts up to zero
.loop:
  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
  punpcklqdq            m4, m4        ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
  punpcklqdq            m3, m3        ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
  paddw                 m4, m0
  paddw                 m3, m0
  packuswb              m4, m3        ; low qword: first line, high: second
  movq      [dstq        ], m4
  movhps    [dstq+strideq], m4
  psrldq                m2, 4         ; advance to the next two left pixels
  lea                 dstq, [dstq+strideq*2]
  inc                 cntq
  jnz .loop
  REP_RET
774
; tm_predictor_16x16(dst, stride, above, left)
; pixel(r, c) = saturate_u8(left[r] + above[c] - above[-1]).
; Each of the 8 passes writes line r (from l1..l8) and line r+8 (from l9..l16).
; tl is taken from byte 15 of the aligned 16-byte load at above-16.
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
  pxor                  m1, m1
  mova                  m0, [aboveq]   ; t1 t2 ... t16 [byte]
  mova                  m2, [aboveq-16]; byte 15 = tl
  mova                  m3, [leftq]    ; l1 l2 ... l16 [byte]
  ; above is fully consumed; its register becomes the loop counter
  DEFINE_ARGS dst, stride, cnt, left, stridex8
  punpckhbw             m4, m0, m1
  punpcklbw             m0, m1         ; m0:m4 t1 t2 ... t16 [word]
  punpckhbw             m2, m1         ; [127:112] tl [word]
  pshufhw               m2, m2, 0xff
  punpckhqdq            m2, m2         ; tl repeated 8 times [word]
  psubw                 m0, m2
  psubw                 m4, m2         ; m0:m4 t1-tl t2-tl ... t16-tl [word]
  punpckhbw             m5, m3, m1     ; m5 = l9 ... l16 [word]
  punpcklbw             m3, m1         ; m3 = l1 ... l8 [word]
  lea            stridex8q, [strideq*8]
  mov                 cntq, -8         ; counts up to zero
.loop:
  pshuflw               m6, m3, 0x0
  pshuflw               m7, m5, 0x0
  punpcklqdq            m6, m6         ; current upper-half left pixel x8 [word]
  punpcklqdq            m7, m7         ; current lower-half left pixel x8 [word]
  ; m1 is free to use as scratch here: the zero is only needed during setup
  paddw                 m1, m6, m0
  paddw                 m6, m4         ; m1:m6 ti-tl+l [i=1,16] [word]
  packuswb              m1, m6
  mova     [dstq          ], m1
  paddw                 m1, m7, m0
  paddw                 m7, m4         ; m1:m7 ti-tl+l [i=1,16], 8 lines down
  packuswb              m1, m7
  mova     [dstq+stridex8q], m1
  psrldq                m3, 2          ; next left pixel, upper half
  psrldq                m5, 2          ; next left pixel, lower half
  lea                 dstq, [dstq+strideq]
  inc                 cntq
  jnz .loop
  REP_RET
812
; tm_predictor_32x32(dst, stride, above, left)
; pixel(r, c) = saturate_u8(left[r] + above[c] - above[-1]).
; The loop runs 16 times and emits two full 32-byte lines per pass.
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
  pxor                  m1, m1
  mova                  m0, [aboveq]      ; t1 .. t16 [byte]
  mova                  m4, [aboveq+16]   ; t17 .. t32 [byte]
  movd                  m2, [aboveq-1]    ; byte 0 = tl
  punpckhbw             m3, m0, m1
  punpcklbw             m0, m1            ; m0:m3 t1 .. t16 [word]
  punpckhbw             m5, m4, m1
  punpcklbw             m4, m1            ; m4:m5 t17 .. t32 [word]
  punpcklbw             m2, m1
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2            ; tl repeated 8 times [word]
  psubw                 m0, m2
  psubw                 m3, m2
  psubw                 m4, m2
  psubw                 m5, m2            ; m0,m3,m4,m5 = t - tl for all 32 columns
  ; above is fully consumed; its register becomes the loop counter
  DEFINE_ARGS dst, stride, cnt, left
  add                leftq, 32            ; index left from its end
  mov                 cntq, -16           ; counts up to zero
.loop:
  ; NOTE(review): movd fetches 4 bytes although only 2 are used; on the last
  ; pass (cnt = -1) this covers left[30..33] -- confirm the buffer allows it.
  movd                  m2, [leftq+cntq*2]
  pxor                  m1, m1            ; m1 doubles as scratch below
  punpcklbw             m2, m1
  pshuflw               m7, m2, 0x55
  pshuflw               m2, m2, 0x0
  punpcklqdq            m7, m7            ; second left pixel x8 [word]
  punpcklqdq            m2, m2            ; first left pixel x8 [word]

  ; first line, columns 0..15 then 16..31
  paddw                 m1, m2, m0
  paddw                 m6, m2, m3
  packuswb              m1, m6
  mova   [dstq           ], m1
  paddw                 m1, m2, m4
  paddw                 m6, m2, m5
  packuswb              m1, m6
  mova   [dstq+16        ], m1

  ; second line, columns 0..15 then 16..31
  paddw                 m1, m7, m0
  paddw                 m6, m7, m3
  packuswb              m1, m6
  mova   [dstq+strideq   ], m1
  paddw                 m1, m7, m4
  paddw                 m6, m7, m5
  packuswb              m1, m6
  mova   [dstq+strideq+16], m1

  lea                 dstq, [dstq+strideq*2]
  inc                 cntq
  jnz .loop
  REP_RET
861