1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
SECTION_RODATA
; Rounding terms added to an edge-pixel sum before the arithmetic right shift
; in the DC predictors below.  pw_N is used when both edges of an NxN block
; are summed (2*N bytes), pw2_N when only one edge is summed (N bytes).
; Every entry is 16 bytes long, so each label has the same alignment as the
; start of the section.
pw_4:    times 8 dw 4      ; 4x4,   both edges:  (sum + 4)  >> 3
pw2_4:   times 8 dw 2      ; 4x4,   one edge:    (sum + 2)  >> 2
pw_8:    times 8 dw 8      ; 8x8,   both edges:  (sum + 8)  >> 4
pw2_8:   times 8 dw 4      ; 8x8,   one edge:    (sum + 4)  >> 3
pw_16:   times 8 dw 16     ; 16x16, both edges:  (sum + 16) >> 5
pw2_16:  times 8 dw 8      ; 16x16, one edge:    (sum + 8)  >> 4
pw_32:   times 8 dw 32     ; 32x32, both edges:  (sum + 32) >> 6
pw2_32:  times 8 dw 16     ; 32x32, one edge:    (sum + 16) >> 5

; Constant fill value for the dc_128 predictors.
dc_128:  times 16 db 128
23
24SECTION .text
25
; dc_predictor_4x4(dst, stride, above, left)
; Writes 4 rows of 4 bytes at dst, dst+stride, ...; every byte is
; (sum of 4 bytes at above + sum of 4 bytes at left + 4) >> 3.
INIT_MMX sse
cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m0, [aboveq]          ; low dword  <- above edge
  punpckldq             m0, [leftq]           ; high dword <- left edge
  DEFINE_ARGS dst, stride, stride3            ; edge pointers are dead from here
  lea             stride3q, [strideq*3]
  pxor                  m1, m1
  psadbw                m0, m1                ; word 0 = sum of the 8 edge bytes
  paddw                 m0, [GLOBAL(pw_4)]    ; rounding term
  psraw                 m0, 3
  pshufw                m0, m0, 0x0           ; broadcast the average word
  packuswb              m0, m0                ; words -> bytes
  movd    [dstq          ], m0
  movd    [dstq+strideq  ], m0
  movd    [dstq+strideq*2], m0
  movd    [dstq+stride3q ], m0

  RESTORE_GOT
  RET
46
; dc_left_predictor_4x4(dst, stride, above, left)
; Writes 4 rows of 4 bytes; every byte is (sum of 4 bytes at left + 2) >> 2.
; The above pointer is not dereferenced.
INIT_MMX sse
cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m0, [leftq]           ; 4 left-edge bytes, upper half zero
  DEFINE_ARGS dst, stride, stride3            ; edge pointers are dead from here
  lea             stride3q, [strideq*3]
  pxor                  m1, m1
  psadbw                m0, m1                ; word 0 = sum of the 4 edge bytes
  paddw                 m0, [GLOBAL(pw2_4)]   ; rounding term
  psraw                 m0, 2
  pshufw                m0, m0, 0x0           ; broadcast the average word
  packuswb              m0, m0                ; words -> bytes
  movd    [dstq          ], m0
  movd    [dstq+strideq  ], m0
  movd    [dstq+strideq*2], m0
  movd    [dstq+stride3q ], m0

  RESTORE_GOT
  RET
66
; dc_top_predictor_4x4(dst, stride, above, left)
; Writes 4 rows of 4 bytes; every byte is (sum of 4 bytes at above + 2) >> 2.
; The left pointer is not dereferenced.
INIT_MMX sse
cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m0, [aboveq]          ; 4 top-edge bytes, upper half zero
  DEFINE_ARGS dst, stride, stride3            ; edge pointers are dead from here
  lea             stride3q, [strideq*3]
  pxor                  m1, m1
  psadbw                m0, m1                ; word 0 = sum of the 4 edge bytes
  paddw                 m0, [GLOBAL(pw2_4)]   ; rounding term
  psraw                 m0, 2
  pshufw                m0, m0, 0x0           ; broadcast the average word
  packuswb              m0, m0                ; words -> bytes
  movd    [dstq          ], m0
  movd    [dstq+strideq  ], m0
  movd    [dstq+strideq*2], m0
  movd    [dstq+stride3q ], m0

  RESTORE_GOT
  RET
86
; dc_predictor_8x8(dst, stride, above, left)
; Writes 8 rows of 8 bytes; every byte is
; (sum of 8 bytes at above + sum of 8 bytes at left + 8) >> 4.
INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [aboveq]
  movq                  m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, dst4      ; edge pointers are dead from here
  pxor                  m1, m1
  psadbw                m0, m1                ; sum of the top edge
  psadbw                m2, m1                ; sum of the left edge
  lea             stride3q, [strideq*3]
  lea                dst4q, [dstq+strideq*4]  ; base of rows 4..7
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw_8)]    ; rounding term
  psraw                 m0, 4
  pshufw                m0, m0, 0x0           ; broadcast the average word
  packuswb              m0, m0                ; words -> bytes
  ; rows 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stride3q  ], m0
  ; rows 4..7
  movq    [dst4q          ], m0
  movq    [dst4q+strideq  ], m0
  movq    [dst4q+strideq*2], m0
  movq    [dst4q+stride3q ], m0

  RESTORE_GOT
  RET
115
; dc_top_predictor_8x8(dst, stride, above, left)
; Writes 8 rows of 8 bytes; every byte is (sum of 8 bytes at above + 4) >> 3.
; The left pointer is not dereferenced.
INIT_MMX sse
cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, dst4      ; edge pointers are dead from here
  pxor                  m1, m1
  psadbw                m0, m1                ; sum of the top edge
  lea             stride3q, [strideq*3]
  lea                dst4q, [dstq+strideq*4]  ; base of rows 4..7
  paddw                 m0, [GLOBAL(pw2_8)]   ; rounding term
  psraw                 m0, 3
  pshufw                m0, m0, 0x0           ; broadcast the average word
  packuswb              m0, m0                ; words -> bytes
  ; rows 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stride3q  ], m0
  ; rows 4..7
  movq    [dst4q          ], m0
  movq    [dst4q+strideq  ], m0
  movq    [dst4q+strideq*2], m0
  movq    [dst4q+stride3q ], m0

  RESTORE_GOT
  RET
141
; dc_left_predictor_8x8(dst, stride, above, left)
; Writes 8 rows of 8 bytes; every byte is (sum of 8 bytes at left + 4) >> 3.
; The above pointer is not dereferenced.
INIT_MMX sse
cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [leftq]
  DEFINE_ARGS dst, stride, stride3, dst4      ; edge pointers are dead from here
  pxor                  m1, m1
  psadbw                m0, m1                ; sum of the left edge
  lea             stride3q, [strideq*3]
  lea                dst4q, [dstq+strideq*4]  ; base of rows 4..7
  paddw                 m0, [GLOBAL(pw2_8)]   ; rounding term
  psraw                 m0, 3
  pshufw                m0, m0, 0x0           ; broadcast the average word
  packuswb              m0, m0                ; words -> bytes
  ; rows 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stride3q  ], m0
  ; rows 4..7
  movq    [dst4q          ], m0
  movq    [dst4q+strideq  ], m0
  movq    [dst4q+strideq*2], m0
  movq    [dst4q+stride3q ], m0

  RESTORE_GOT
  RET
167
; dc_128_predictor_4x4(dst, stride, above, left)
; Writes 4 rows of 4 bytes, each byte equal to 128.  Neither edge pointer is
; dereferenced.
INIT_MMX sse
cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m0, [GLOBAL(dc_128)]  ; four bytes of 128
  ; rows 0..1
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
  ; rows 2..3
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  RESTORE_GOT
  RET
181
; dc_128_predictor_8x8(dst, stride, above, left)
; Writes 8 rows of 8 bytes, each byte equal to 128.  Neither edge pointer is
; dereferenced.
INIT_MMX sse
cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, dst4
  movq                  m0, [GLOBAL(dc_128)]  ; eight bytes of 128
  lea             stride3q, [strideq*3]
  lea                dst4q, [dstq+strideq*4]  ; base of rows 4..7
  ; rows 0..3
  movq    [dstq           ], m0
  movq    [dstq+strideq   ], m0
  movq    [dstq+strideq*2 ], m0
  movq    [dstq+stride3q  ], m0
  ; rows 4..7
  movq    [dst4q          ], m0
  movq    [dst4q+strideq  ], m0
  movq    [dst4q+strideq*2], m0
  movq    [dst4q+stride3q ], m0
  RESTORE_GOT
  RET
200
; dc_predictor_16x16(dst, stride, above, left)
; Writes 16 rows of 16 bytes; every byte is
; (sum of 16 bytes at above + sum of 16 bytes at left + 16) >> 5.
; above, left and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, cnt       ; edge pointers are dead from here
  pxor                  m1, m1
  psadbw                m0, m1                ; two partial sums (one per 8 bytes)
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0                ; bring the upper partial sum down
  paddw                 m0, m2                ; word 0 = sum of all 32 edge bytes
  paddw                 m0, [GLOBAL(pw_16)]   ; rounding term
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0           ; broadcast across the low 4 words
  punpcklqdq            m0, m0                ; ... and into the high 4 words
  packuswb              m0, m0                ; words -> 16 identical bytes
  lea             stride3q, [strideq*3]
  mov                 cntd, 4                 ; 4 passes of 4 rows
.fill4:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .fill4

  RESTORE_GOT
  REP_RET
232
233
; dc_top_predictor_16x16(dst, stride, above, left)
; Writes 16 rows of 16 bytes; every byte is
; (sum of 16 bytes at above + 8) >> 4.  The left pointer is not dereferenced.
; above and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4                 ; 4 passes of 4 rows
  psadbw                m0, m1                ; two partial sums (one per 8 bytes)
  ; Only one edge contributes, so the partial sums are folded directly; the
  ; zero register is reused as the temporary (its upper half stays zero).
  movhlps               m1, m0
  paddw                 m0, m1                ; word 0 = sum of all 16 edge bytes
  paddw                 m0, [GLOBAL(pw2_16)]  ; rounding term
  psraw                 m0, 4
  pshuflw               m0, m0, 0x0           ; broadcast across the low 4 words
  punpcklqdq            m0, m0                ; ... and into the high 4 words
  packuswb              m0, m0                ; words -> 16 identical bytes
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET
265
; dc_left_predictor_16x16(dst, stride, above, left)
; Writes 16 rows of 16 bytes; every byte is
; (sum of 16 bytes at left + 8) >> 4.  The above pointer is not dereferenced.
; left and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4                 ; 4 passes of 4 rows
  psadbw                m0, m1                ; two partial sums (one per 8 bytes)
  ; Only one edge contributes, so the partial sums are folded directly; the
  ; zero register is reused as the temporary (its upper half stays zero).
  movhlps               m1, m0
  paddw                 m0, m1                ; word 0 = sum of all 16 edge bytes
  paddw                 m0, [GLOBAL(pw2_16)]  ; rounding term
  psraw                 m0, 4
  pshuflw               m0, m0, 0x0           ; broadcast across the low 4 words
  punpcklqdq            m0, m0                ; ... and into the high 4 words
  packuswb              m0, m0                ; words -> 16 identical bytes
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET
297
; dc_128_predictor_16x16(dst, stride, above, left)
; Writes 16 rows of 16 bytes, each byte equal to 128.  Neither edge pointer is
; dereferenced.  Destination rows are written with aligned moves.
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4                 ; 4 passes of 4 rows
  mova    m0,        [GLOBAL(dc_128)]
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop
  RESTORE_GOT
  ; The return can directly follow the loop branch, so use REP_RET like the
  ; other looped predictors in this file.
  REP_RET
316
317
; dc_predictor_32x32(dst, stride, above, left)
; Writes 32 rows of 32 bytes; every byte is
; (sum of 32 bytes at above + sum of 32 bytes at left + 32) >> 6.
; above, left and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [leftq]
  mova                  m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, cnt       ; edge pointers are dead from here
  pxor                  m1, m1
  psadbw                m0, m1                ; two partial sums per register
  psadbw                m2, m1
  psadbw                m3, m1
  psadbw                m4, m1
  paddw                 m0, m2                ; top edge
  paddw                 m3, m4                ; left edge
  paddw                 m0, m3
  movhlps               m2, m0                ; bring the upper partial sum down
  paddw                 m0, m2                ; word 0 = sum of all 64 edge bytes
  paddw                 m0, [GLOBAL(pw_32)]   ; rounding term
  psraw                 m0, 6
  pshuflw               m0, m0, 0x0           ; broadcast across the low 4 words
  punpcklqdq            m0, m0                ; ... and into the high 4 words
  packuswb              m0, m0                ; words -> 16 identical bytes
  lea             stride3q, [strideq*3]
  mov                 cntd, 8                 ; 8 passes of 4 rows
.fill4:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .fill4

  RESTORE_GOT
  REP_RET
359
; dc_top_predictor_32x32(dst, stride, above, left)
; Writes 32 rows of 32 bytes; every byte is
; (sum of 32 bytes at above + 16) >> 5.  The left pointer is not dereferenced.
; above and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, cnt       ; edge pointers are dead from here
  pxor                  m1, m1
  psadbw                m0, m1                ; two partial sums per register
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0                ; bring the upper partial sum down
  paddw                 m0, m2                ; word 0 = sum of all 32 edge bytes
  paddw                 m0, [GLOBAL(pw2_32)]  ; rounding term
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0           ; broadcast across the low 4 words
  punpcklqdq            m0, m0                ; ... and into the high 4 words
  packuswb              m0, m0                ; words -> 16 identical bytes
  lea             stride3q, [strideq*3]
  mov                 cntd, 8                 ; 8 passes of 4 rows
.fill4:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .fill4

  RESTORE_GOT
  REP_RET
395
; dc_left_predictor_32x32(dst, stride, above, left)
; Writes 32 rows of 32 bytes; every byte is
; (sum of 32 bytes at left + 16) >> 5.  The above pointer is not dereferenced.
; left and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [leftq]
  mova                  m2, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, cnt       ; edge pointers are dead from here
  pxor                  m1, m1
  psadbw                m0, m1                ; two partial sums per register
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0                ; bring the upper partial sum down
  paddw                 m0, m2                ; word 0 = sum of all 32 edge bytes
  paddw                 m0, [GLOBAL(pw2_32)]  ; rounding term
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0           ; broadcast across the low 4 words
  punpcklqdq            m0, m0                ; ... and into the high 4 words
  packuswb              m0, m0                ; words -> 16 identical bytes
  lea             stride3q, [strideq*3]
  mov                 cntd, 8                 ; 8 passes of 4 rows
.fill4:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .fill4

  RESTORE_GOT
  REP_RET
431
; dc_128_predictor_32x32(dst, stride, above, left)
; Writes 32 rows of 32 bytes, each byte equal to 128.  Neither edge pointer is
; dereferenced.  Destination rows are written with aligned moves.
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8                 ; 8 passes of 4 rows
  mova    m0,        [GLOBAL(dc_128)]
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop
  RESTORE_GOT
  ; The return can directly follow the loop branch, so use REP_RET like the
  ; other looped predictors in this file.
  REP_RET
454
; v_predictor_4x4(dst, stride, above)
; Copies the 4 bytes at above into each of 4 rows starting at dst.
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
  movd                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3            ; above pointer is dead from here
  lea             stride3q, [strideq*3]
  movd    [dstq          ], m0
  movd    [dstq+strideq  ], m0
  movd    [dstq+strideq*2], m0
  movd    [dstq+stride3q ], m0
  RET
464
; v_predictor_8x8(dst, stride, above)
; Copies the 8 bytes at above into each of 8 rows starting at dst.
INIT_MMX sse
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  ; rows 0..5, two rows per step
%rep 3
  movq      [dstq        ], m0
  movq      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
%endrep
  ; rows 6..7
  movq      [dstq        ], m0
  movq      [dstq+strideq], m0
  RET
480
; v_predictor_16x16(dst, stride, above)
; Copies the 16 bytes at above into each of 16 rows starting at dst.
; above and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, cnt       ; above pointer is dead from here
  mov                 cntd, 4                 ; 4 passes of 4 rows
  lea             stride3q, [strideq*3]
.copy4:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .copy4
  REP_RET
496
; v_predictor_32x32(dst, stride, above)
; Copies the 32 bytes at above into each of 32 rows starting at dst.
; above and every destination row are accessed with aligned moves.
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]          ; columns  0..15
  mova                  m1, [aboveq+16]       ; columns 16..31
  DEFINE_ARGS dst, stride, stride3, cnt       ; above pointer is dead from here
  mov                 cntd, 8                 ; 8 passes of 4 rows
  lea             stride3q, [strideq*3]
.copy4:
  mova [dstq             ], m0
  mova [dstq          +16], m1
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m1
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m1
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m1
  lea                 dstq, [dstq+strideq*4]
  dec                 cntd
  jnz .copy4
  REP_RET
517
; tm_predictor_4x4(dst, stride, above, left)
; For each of 4 rows r and 4 columns c writes
;   clamp_u8(left[r] + above[c] - above[-1])
; where the clamp is the unsigned saturation of packuswb.  Two rows are
; produced per loop pass.
INIT_MMX sse
cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
  movd                  m0, [aboveq]
  movd                  m2, [aboveq-1]
  pxor                  m1, m1
  punpcklbw             m0, m1                ; above[0..3] widened to words
  punpcklbw             m2, m1                ; word 0 = above[-1]
  pshufw                m2, m2, 0x0           ; broadcast above[-1]
  psubw                 m0, m2                ; m0 = above[c] - above[-1]
  DEFINE_ARGS dst, stride, pair, left         ; above pointer is dead from here
  add                leftq, 4                 ; index left[] from its end ...
  mov                pairq, -2                ; ... with a counter rising to 0
.next_pair:
  ; movd loads 4 bytes, but only the lowest one survives the word broadcast
  movd                  m2, [leftq+pairq*2]   ; left[] byte for the even row
  punpcklbw             m2, m1
  pshufw                m2, m2, 0x0
  paddw                 m2, m0
  packuswb              m2, m2
  movd                  m3, [leftq+pairq*2+1] ; left[] byte for the odd row
  punpcklbw             m3, m1
  pshufw                m3, m3, 0x0
  paddw                 m3, m0
  packuswb              m3, m3
  movd      [dstq        ], m2
  movd      [dstq+strideq], m3
  lea                 dstq, [dstq+strideq*2]
  inc                pairq
  jnz .next_pair
  REP_RET
547
; tm_predictor_8x8(dst, stride, above, left)
; For each of 8 rows r and 8 columns c writes
;   clamp_u8(left[r] + above[c] - above[-1])
; where the clamp is the unsigned saturation of packuswb.  Two rows are
; produced per loop pass and packed into a single register.
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
  movq                  m0, [aboveq]
  movd                  m2, [aboveq-1]
  pxor                  m1, m1
  punpcklbw             m0, m1                ; above[0..7] widened to words
  punpcklbw             m2, m1                ; word 0 = above[-1]
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2                ; broadcast above[-1] to 8 words
  psubw                 m0, m2                ; m0 = above[c] - above[-1]
  DEFINE_ARGS dst, stride, pair, left         ; above pointer is dead from here
  add                leftq, 8                 ; index left[] from its end ...
  mov                pairq, -4                ; ... with a counter rising to 0
.next_pair:
  ; movd loads 4 bytes, but only the lowest one survives the word broadcast
  movd                  m2, [leftq+pairq*2]   ; left[] byte for the even row
  punpcklbw             m2, m1
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2
  paddw                 m2, m0
  movd                  m3, [leftq+pairq*2+1] ; left[] byte for the odd row
  punpcklbw             m3, m1
  pshuflw               m3, m3, 0x0
  punpcklqdq            m3, m3
  paddw                 m3, m0
  packuswb              m2, m3                ; low 8 bytes: even row, high: odd
  movq      [dstq        ], m2
  movhps    [dstq+strideq], m2
  lea                 dstq, [dstq+strideq*2]
  inc                pairq
  jnz .next_pair
  REP_RET
579
; tm_predictor_16x16(dst, stride, above, left)
; For each of 16 rows r and 16 columns c writes
;   clamp_u8(left[r] + above[c] - above[-1])
; where the clamp is the unsigned saturation of packuswb.  Two rows are
; produced per loop pass.  above and the destination rows are accessed with
; aligned moves.
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
  mova                  m0, [aboveq]
  movd                  m2, [aboveq-1]
  pxor                  m1, m1
  punpckhbw             m4, m0, m1            ; above[8..15] widened to words
  punpcklbw             m0, m1                ; above[0..7]  widened to words
  punpcklbw             m2, m1                ; word 0 = above[-1]
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2                ; broadcast above[-1] to 8 words
  psubw                 m0, m2                ; m0 = above[0..7]  - above[-1]
  psubw                 m4, m2                ; m4 = above[8..15] - above[-1]
  DEFINE_ARGS dst, stride, pair, left         ; above pointer is dead from here
  add                leftq, 16                ; index left[] from its end ...
  mov                pairq, -8                ; ... with a counter rising to 0
.next_pair:
  ; movd loads 4 bytes, but only the lowest one survives the word broadcast
  movd                  m2, [leftq+pairq*2]   ; left[] byte for the even row
  punpcklbw             m2, m1
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2
  movd                  m3, [leftq+pairq*2+1] ; left[] byte for the odd row
  punpcklbw             m3, m1
  pshuflw               m3, m3, 0x0
  punpcklqdq            m3, m3
  ; even row
  paddw                 m5, m2, m0            ; columns 0..7
  paddw                 m2, m4                ; columns 8..15
  packuswb              m5, m2
  mova      [dstq        ], m5
  ; odd row
  paddw                 m6, m3, m0            ; columns 0..7
  paddw                 m3, m4                ; columns 8..15
  packuswb              m6, m3
  mova      [dstq+strideq], m6
  lea                 dstq, [dstq+strideq*2]
  inc                pairq
  jnz .next_pair
  REP_RET
616
; tm_predictor_32x32(dst, stride, above, left)
; For each of 32 rows r and 32 columns c writes
;   clamp_u8(left[r] + above[c] - above[-1])
; where the clamp is the unsigned saturation of packuswb.  Two rows are
; produced per loop pass.  above and the destination rows are accessed with
; aligned moves.  Uses registers m0..m9 and is only assembled when
; ARCH_X86_64 is set.
%if ARCH_X86_64
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
  mova                  m0, [aboveq]
  mova                  m4, [aboveq+16]
  movd                  m2, [aboveq-1]
  pxor                  m1, m1
  punpckhbw             m3, m0, m1            ; above[8..15]  widened to words
  punpcklbw             m0, m1                ; above[0..7]
  punpckhbw             m5, m4, m1            ; above[24..31]
  punpcklbw             m4, m1                ; above[16..23]
  punpcklbw             m2, m1                ; word 0 = above[-1]
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2                ; broadcast above[-1] to 8 words
  psubw                 m0, m2                ; m0/m3/m4/m5 = above[c] - above[-1]
  psubw                 m3, m2
  psubw                 m4, m2
  psubw                 m5, m2
  DEFINE_ARGS dst, stride, pair, left         ; above pointer is dead from here
  add                leftq, 32                ; index left[] from its end ...
  mov                pairq, -16               ; ... with a counter rising to 0
.next_pair:
  ; movd loads 4 bytes, but only the lowest one survives the word broadcast
  movd                  m2, [leftq+pairq*2]   ; left[] byte for the even row
  punpcklbw             m2, m1
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2
  movd                  m6, [leftq+pairq*2+1] ; left[] byte for the odd row
  punpcklbw             m6, m1
  pshuflw               m6, m6, 0x0
  punpcklqdq            m6, m6
  ; even row
  paddw                 m7, m2, m0            ; columns  0..7
  paddw                 m8, m2, m3            ; columns  8..15
  paddw                 m9, m2, m4            ; columns 16..23
  paddw                 m2, m5                ; columns 24..31
  packuswb              m7, m8
  packuswb              m9, m2
  mova   [dstq           ], m7
  mova   [dstq        +16], m9
  ; odd row
  paddw                 m2, m6, m0            ; columns  0..7
  paddw                 m8, m6, m3            ; columns  8..15
  paddw                 m7, m6, m4            ; columns 16..23
  paddw                 m6, m5                ; columns 24..31
  packuswb              m2, m8
  packuswb              m7, m6
  mova   [dstq+strideq   ], m2
  mova   [dstq+strideq+16], m7
  lea                 dstq, [dstq+strideq*2]
  inc                pairq
  jnz .next_pair
  REP_RET
%endif
668