1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
; Rounding terms used by the DC predictors below.  Each table is eight 16-bit
; words holding half of the power-of-two divisor that the matching psraw
; applies (4 before >>3, 8 before >>4, 16 before >>5, 32 before >>6).
SECTION_RODATA
pw_4:   dw  4,  4,  4,  4,  4,  4,  4,  4
pw_8:   dw  8,  8,  8,  8,  8,  8,  8,  8
pw_16:  dw 16, 16, 16, 16, 16, 16, 16, 16
pw_32:  dw 32, 32, 32, 32, 32, 32, 32, 32
18
19SECTION .text
20
;------------------------------------------------------------------------------
; dc_predictor_4x4(dst, stride, above, left)
;
; Fills the 4x4 byte block at dst (rows `stride` bytes apart) with one value:
;     dc = (above[0..3] + left[0..3] summed as unsigned bytes + 4) >> 3
;
; goffset is a fifth scratch register handed to GET_GOT so that GLOBAL(pw_4)
; can be addressed; GET_GOT / GLOBAL / RESTORE_GOT are macros defined outside
; this file (presumably position-independent-code support -- confirm there).
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    ; Gather the eight edge bytes into one MMX register.
    movd        m0, [aboveq]                ; low dword  = above[0..3]
    punpckldq   m0, [leftq]                 ; high dword = left[0..3]

    ; Horizontal byte sum: SAD against zero leaves the total in word 0.
    pxor        m1, m1
    psadbw      m0, m1

    ; Round, divide by 8 and replicate the result into every byte.
    paddw       m0, [GLOBAL(pw_4)]
    psraw       m0, 3
    pshufw      m0, m0, 0                   ; word 0 -> all four words
    packuswb    m0, m0                      ; words -> bytes (dc fits in 8 bits)

    ; Rows 0 and 1.
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    ; Rows 2 and 3.
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0
    movd        [dstq+strideq], m0

    RESTORE_GOT
    RET
41
;------------------------------------------------------------------------------
; dc_predictor_8x8(dst, stride, above, left)
;
; Fills the 8x8 byte block at dst (rows `stride` bytes apart) with one value:
;     dc = (sum of above[0..7] + sum of left[0..7] + 8) >> 4
;
; goffset is scratch for GET_GOT / GLOBAL / RESTORE_GOT, which are macros
; defined outside this file.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]                ; above[0..7]
    movq        m2, [leftq]                 ; left[0..7]
    pxor        m1, m1                      ; zero reference for psadbw

    ; Both edge pointers are dead from here on; the register that held
    ; `above` is recycled as 3*stride.
    DEFINE_ARGS dst, stride, stride_x3

    psadbw      m0, m1                      ; word 0 = sum(above)
    psadbw      m2, m1                      ; word 0 = sum(left)
    paddw       m0, m2
    paddw       m0, [GLOBAL(pw_8)]          ; rounding term
    psraw       m0, 4                       ; / 16
    pshufw      m0, m0, 0                   ; broadcast word 0
    packuswb    m0, m0                      ; eight identical bytes

    lea         stride_x3q, [strideq*3]

    ; Rows 0-3.
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    ; Rows 4-7.
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0

    RESTORE_GOT
    RET
70
;------------------------------------------------------------------------------
; dc_predictor_16x16(dst, stride, above, left)
;
; Fills the 16x16 byte block at dst (rows `stride` bytes apart) with one value:
;     dc = (sum of above[0..15] + sum of left[0..15] + 16) >> 5
;
; All 16-byte accesses use mova (x86inc's aligned move), so above, left and
; every destination row are expected to be 16-byte aligned.
; goffset is scratch for GET_GOT / GLOBAL / RESTORE_GOT (defined elsewhere).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [leftq]                 ; left[0..15]
    pxor        m1, m1                      ; zero reference for psadbw

    ; The edge pointers are no longer needed: their registers become
    ; 3*stride and the count of four-row groups still to be written.
    DEFINE_ARGS dst, stride, stride_x3, quads

    ; psadbw yields one partial sum per 64-bit half.
    psadbw      m0, m1
    psadbw      m2, m1
    paddw       m0, m2                      ; per-half above+left sums
    movhlps     m2, m0                      ; bring the upper-half sum down
    paddw       m0, m2                      ; word 0 = total of all 32 bytes
    paddw       m0, [GLOBAL(pw_16)]         ; rounding term
    psraw       m0, 5                       ; / 32

    ; Replicate word 0 across the register, then narrow to bytes.
    pshuflw     m0, m0, 0
    punpcklqdq  m0, m0
    packuswb    m0, m0

    lea         stride_x3q, [strideq*3]
    mov         quadsd, 4                   ; 4 groups x 4 rows = 16 rows
.store4:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         quadsd
    jnz .store4

    RESTORE_GOT
    REP_RET
102
;------------------------------------------------------------------------------
; dc_predictor_32x32(dst, stride, above, left)
;
; Fills the 32x32 byte block at dst (rows `stride` bytes apart) with one value:
;     dc = (sum of above[0..31] + sum of left[0..31] + 32) >> 6
;
; All 16-byte accesses use mova (x86inc's aligned move), so above, left and
; every destination row are expected to be 16-byte aligned.
; goffset is scratch for GET_GOT / GLOBAL / RESTORE_GOT (defined elsewhere).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[ 0..15]
    mova        m2, [aboveq+16]             ; above[16..31]
    mova        m3, [leftq]                 ; left[ 0..15]
    mova        m4, [leftq+16]              ; left[16..31]
    pxor        m1, m1                      ; zero reference for psadbw

    ; Edge pointers are dead now: reuse their registers for 3*stride and
    ; the number of four-row groups left to store.
    DEFINE_ARGS dst, stride, stride_x3, quads

    ; One partial byte sum per 64-bit half of each register.
    psadbw      m0, m1
    psadbw      m2, m1
    psadbw      m3, m1
    psadbw      m4, m1
    paddw       m0, m2
    paddw       m0, m3
    paddw       m0, m4                      ; per-half totals
    movhlps     m2, m0                      ; upper-half total -> low qword
    paddw       m0, m2                      ; word 0 = sum of all 64 bytes
    paddw       m0, [GLOBAL(pw_32)]         ; rounding term
    psraw       m0, 6                       ; / 64

    ; Replicate word 0 across the register, then narrow to bytes.
    pshuflw     m0, m0, 0
    punpcklqdq  m0, m0
    packuswb    m0, m0

    lea         stride_x3q, [strideq*3]
    mov         quadsd, 8                   ; 8 groups x 4 rows = 32 rows
.store4:
    ; Each row is two 16-byte halves.
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride_x3q], m0
    mova        [dstq+stride_x3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         quadsd
    jnz .store4

    RESTORE_GOT
    REP_RET
144
;------------------------------------------------------------------------------
; v_predictor_4x4(dst, stride, above)
;
; Vertical prediction: copies the four bytes at above[0..3] into each of the
; four rows of the block at dst (rows `stride` bytes apart).
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    movd        m0, [aboveq]                ; the row to replicate

    ; Rows 0 and 1.
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    ; Rows 2 and 3.
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    RET
154
;------------------------------------------------------------------------------
; v_predictor_8x8(dst, stride, above)
;
; Vertical prediction: copies the eight bytes at above[0..7] into each of the
; eight rows of the block at dst (rows `stride` bytes apart).
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]                ; the row to replicate

    ; `above` has been consumed; its register now holds 3*stride.
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]

    ; Rows 0-3.
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    ; Rows 4-7.
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    RET
170
;------------------------------------------------------------------------------
; v_predictor_16x16(dst, stride, above)
;
; Vertical prediction: copies the 16 bytes at above[0..15] into each of the
; 16 rows of the block at dst (rows `stride` bytes apart).
;
; mova is x86inc's aligned move, so `above` and every destination row are
; expected to be 16-byte aligned.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    mova        m0, [aboveq]                ; the row to replicate

    ; `above` has been consumed: reuse its register for 3*stride and take
    ; the fourth register as the counter of four-row groups.
    DEFINE_ARGS dst, stride, stride_x3, quads
    mov         quadsd, 4                   ; 4 groups x 4 rows = 16 rows
    lea         stride_x3q, [strideq*3]
.store4:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         quadsd
    jnz .store4
    REP_RET
186
;------------------------------------------------------------------------------
; v_predictor_32x32(dst, stride, above)
;
; Vertical prediction: copies the 32 bytes at above[0..31] into each of the
; 32 rows of the block at dst (rows `stride` bytes apart).
;
; mova is x86inc's aligned move, so `above` and every destination row are
; expected to be 16-byte aligned.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]                ; left  half of the row: above[ 0..15]
    mova        m1, [aboveq+16]             ; right half of the row: above[16..31]

    ; `above` has been consumed: reuse its register for 3*stride and take
    ; the fourth register as the counter of four-row groups.
    DEFINE_ARGS dst, stride, stride_x3, quads
    mov         quadsd, 8                   ; 8 groups x 4 rows = 32 rows
    lea         stride_x3q, [strideq*3]
.store4:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride_x3q], m0
    mova        [dstq+stride_x3q+16], m1
    lea         dstq, [dstq+strideq*4]
    dec         quadsd
    jnz .store4
    REP_RET
207
;------------------------------------------------------------------------------
; tm_predictor_4x4(dst, stride, above, left)
;
; For every pixel of the 4x4 block at dst (rows `stride` bytes apart):
;     dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done in 16-bit lanes; packuswb performs the clamp.
;
; NOTE: the single-byte operands are fetched with 4-byte movd loads, so the
; code reads above[-1..2] for the corner and up to 3 bytes beyond left[3]
; on the final iteration; only the lowest byte of each load is used.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
    pxor        m1, m1                      ; zero, for byte -> word widening

    ; m0 = above[0..3] - above[-1], one 16-bit lane per column.
    movd        m0, [aboveq]
    movd        m2, [aboveq-1]
    punpcklbw   m0, m1
    punpcklbw   m2, m1
    pshufw      m2, m2, 0                   ; broadcast above[-1]
    psubw       m0, m2

    ; `above` is dead: its register becomes a negative row-pair counter that
    ; runs -2 .. -1, and left is advanced to one past its last element so
    ; that left + 2*pair addresses the current even row.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 4
    mov         pairq, -2
.rows:
    movd        m2, [leftq+pairq*2]         ; low byte = left[even row]
    movd        m3, [leftq+pairq*2+1]       ; low byte = left[odd row]

    ; Even row.
    punpcklbw   m2, m1
    pshufw      m2, m2, 0
    paddw       m2, m0
    packuswb    m2, m2

    ; Odd row.
    punpcklbw   m3, m1
    pshufw      m3, m3, 0
    paddw       m3, m0
    packuswb    m3, m3

    movd        [dstq], m2
    movd        [dstq+strideq], m3
    lea         dstq, [dstq+strideq*2]      ; lea leaves flags untouched
    inc         pairq
    jnz .rows
    REP_RET
237
;------------------------------------------------------------------------------
; tm_predictor_8x8(dst, stride, above, left)
;
; For every pixel of the 8x8 block at dst (rows `stride` bytes apart):
;     dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done in 16-bit lanes; packuswb performs the clamp.
;
; NOTE: the single-byte operands are fetched with 4-byte movd loads, so the
; code reads above[-1..2] for the corner and up to 3 bytes beyond left[7]
; on the final iteration; only the lowest byte of each load is used.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
    pxor        m1, m1                      ; zero, for byte -> word widening

    ; m0 = above[0..7] - above[-1], one 16-bit lane per column.
    movq        m0, [aboveq]
    movd        m2, [aboveq-1]
    punpcklbw   m0, m1
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0                   ; broadcast above[-1] ...
    punpcklqdq  m2, m2                      ; ... to all eight lanes
    psubw       m0, m2

    ; `above` is dead: its register becomes a negative row-pair counter that
    ; runs -4 .. -1, and left is advanced to one past its last element so
    ; that left + 2*pair addresses the current even row.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 8
    mov         pairq, -4
.rows:
    movd        m2, [leftq+pairq*2]         ; low byte = left[even row]
    movd        m3, [leftq+pairq*2+1]       ; low byte = left[odd row]

    ; Even row, as eight words.
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0
    punpcklqdq  m2, m2
    paddw       m2, m0

    ; Odd row, as eight words.
    punpcklbw   m3, m1
    pshuflw     m3, m3, 0
    punpcklqdq  m3, m3
    paddw       m3, m0

    ; Clamp both rows into one register: low half even, high half odd.
    packuswb    m2, m3
    movq        [dstq], m2
    movhps      [dstq+strideq], m2
    lea         dstq, [dstq+strideq*2]      ; lea leaves flags untouched
    inc         pairq
    jnz .rows
    REP_RET
269
;------------------------------------------------------------------------------
; tm_predictor_16x16(dst, stride, above, left)
;
; For every pixel of the 16x16 block at dst (rows `stride` bytes apart):
;     dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done in 16-bit lanes; packuswb performs the clamp.
;
; `above` and every destination row are accessed with mova (x86inc's aligned
; move) and are therefore expected to be 16-byte aligned.
;
; NOTE: the single-byte operands are fetched with 4-byte movd loads, so the
; code reads above[-1..2] for the corner and up to 3 bytes beyond left[15]
; on the final iteration; only the lowest byte of each load is used.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
    pxor        m1, m1                      ; zero, for byte -> word widening

    ; Widen the above row: m0 = columns 0-7, m4 = columns 8-15.
    mova        m0, [aboveq]
    movd        m2, [aboveq-1]
    punpckhbw   m4, m0, m1                  ; must precede the in-place widen
    punpcklbw   m0, m1

    ; Broadcast above[-1] to eight words and subtract it from every column.
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0
    punpcklqdq  m2, m2
    psubw       m0, m2
    psubw       m4, m2

    ; `above` is dead: its register becomes a negative row-pair counter that
    ; runs -8 .. -1, and left is advanced to one past its last element so
    ; that left + 2*pair addresses the current even row.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 16
    mov         pairq, -8
.rows:
    movd        m2, [leftq+pairq*2]         ; low byte = left[even row]
    movd        m3, [leftq+pairq*2+1]       ; low byte = left[odd row]

    ; Even row -> m5.
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0
    punpcklqdq  m2, m2                      ; left[even] in all eight lanes
    paddw       m5, m2, m0                  ; columns 0-7
    paddw       m2, m4                      ; columns 8-15
    packuswb    m5, m2

    ; Odd row -> m6.
    punpcklbw   m3, m1
    pshuflw     m3, m3, 0
    punpcklqdq  m3, m3                      ; left[odd] in all eight lanes
    paddw       m6, m3, m0                  ; columns 0-7
    paddw       m3, m4                      ; columns 8-15
    packuswb    m6, m3

    mova        [dstq], m5
    mova        [dstq+strideq], m6
    lea         dstq, [dstq+strideq*2]      ; lea leaves flags untouched
    inc         pairq
    jnz .rows
    REP_RET
306
; Only assembled for x86-64: the routine below uses registers m0-m9.
%if ARCH_X86_64
;------------------------------------------------------------------------------
; tm_predictor_32x32(dst, stride, above, left)
;
; For every pixel of the 32x32 block at dst (rows `stride` bytes apart):
;     dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done in 16-bit lanes; packuswb performs the clamp.
;
; `above` and every destination row are accessed with mova (x86inc's aligned
; move) and are therefore expected to be 16-byte aligned.
;
; NOTE: the single-byte operands are fetched with 4-byte movd loads, so the
; code reads above[-1..2] for the corner and up to 3 bytes beyond left[31]
; on the final iteration; only the lowest byte of each load is used.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
    pxor        m1, m1                      ; zero, for byte -> word widening

    ; Widen the above row into four registers of eight words each:
    ;   m0 = columns 0-7,   m3 = columns 8-15,
    ;   m4 = columns 16-23, m5 = columns 24-31.
    mova        m0, [aboveq]
    mova        m4, [aboveq+16]
    movd        m2, [aboveq-1]
    punpckhbw   m3, m0, m1                  ; high halves first, before the
    punpcklbw   m0, m1                      ; sources are widened in place
    punpckhbw   m5, m4, m1
    punpcklbw   m4, m1

    ; Broadcast above[-1] to eight words and subtract it from every column.
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0
    punpcklqdq  m2, m2
    psubw       m0, m2
    psubw       m3, m2
    psubw       m4, m2
    psubw       m5, m2

    ; `above` is dead: its register becomes a negative row-pair counter that
    ; runs -16 .. -1, and left is advanced to one past its last element so
    ; that left + 2*pair addresses the current even row.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 32
    mov         pairq, -16
.rows:
    movd        m2, [leftq+pairq*2]         ; low byte = left[even row]
    movd        m6, [leftq+pairq*2+1]       ; low byte = left[odd row]

    ; Splat each left pixel across eight 16-bit lanes.
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0
    punpcklqdq  m2, m2
    punpcklbw   m6, m1
    pshuflw     m6, m6, 0
    punpcklqdq  m6, m6

    ; Even row: m7 = bytes 0-15, m9 = bytes 16-31.
    paddw       m7, m2, m0
    paddw       m8, m2, m3
    paddw       m9, m2, m4
    paddw       m2, m5
    packuswb    m7, m8
    packuswb    m9, m2
    mova        [dstq], m7
    mova        [dstq+16], m9

    ; Odd row: m2 = bytes 0-15, m7 = bytes 16-31 (m7/m8 are free again).
    paddw       m2, m6, m0
    paddw       m8, m6, m3
    paddw       m7, m6, m4
    paddw       m6, m5
    packuswb    m2, m8
    packuswb    m7, m6
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m7

    lea         dstq, [dstq+strideq*2]      ; lea leaves flags untouched
    inc         pairq
    jnz .rows
    REP_RET
%endif
358