;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding terms added to the DC sums before the final right shift.
; pw_4 / pw_8 are vectors of eight 16-bit words (added with paddw).
; pw_16 / pw_32 keep the pw_ prefix but hold four 32-bit dwords; the DC
; predictors that use them add them with paddd.
pw_4:  times 8 dw 4
pw_8:  times 8 dw 8
pw_16: times 4 dd 16
pw_32: times 4 dd 32

SECTION .text
;------------------------------------------------------------------------------
; highbd_dc_predictor_4x4(dst, stride, above, left)
;
; Fills a 4x4 block of 16-bit samples with (sum(above[0..3]) +
; sum(left[0..3]) + 4) >> 3.
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - four words, loaded with a single movq
;   left   - four words, loaded with a single movq
;
; The total is held in a 16-bit lane and shifted arithmetically (psraw), so
; sum + 4 has to stay below 32768 (presumably inputs are at most 12 bits wide
; -- TODO confirm against the callers).
; NOTE(review): this routine uses MMX registers and does not execute emms;
; presumably the caller clears the x87/MMX state -- confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [aboveq]
  movq                  m2, [leftq]
  DEFINE_ARGS dst, stride, one         ; above/left pointers are dead from here
  mov                 oned, 0x0001
  pxor                  m1, m1
  movd                  m3, oned
  pshufw                m3, m3, 0x0    ; m3 = four words of 1
  paddw                 m0, m2         ; above[i] + left[i]
  pmaddwd               m0, m3         ; add adjacent words -> two dword sums
  packssdw              m0, m1         ; back to words: s0 s1 0 0
  pmaddwd               m0, m3         ; dword 0 = s0 + s1 = total
  paddw                 m0, [GLOBAL(pw_4)]  ; rounding term
  psraw                 m0, 3
  pshufw                m0, m0, 0x0    ; broadcast the DC value to all lanes
  movq    [dstq          ], m0         ; row 0
  movq    [dstq+strideq*2], m0         ; row 1
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0         ; row 2
  movq    [dstq+strideq*2], m0         ; row 3

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_8x8(dst, stride, above, left)
;
; Fills an 8x8 block of 16-bit samples with (sum(above[0..7]) +
; sum(left[0..7]) + 8) >> 4.
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - eight words, loaded with mova
;   left   - eight words, loaded with mova
;
; All loads and stores use mova (assumed to be the aligned move under x86inc
; -- rows of dst, above and left would then need 16-byte alignment; confirm).
; The final add/shift are done on a 16-bit lane with a logical shift (psrlw),
; so sum + 8 has to fit in an unsigned 16-bit value.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, one  ; above/left pointers are dead now
  mov                 oned, 0x00010001
  lea             stride3q, [strideq*3]
  movd                  m3, oned
  pshufd                m3, m3, 0x0    ; m3 = eight words of 1
  paddw                 m0, m2         ; above[i] + left[i]
  pmaddwd               m0, m3         ; 8 words  -> 4 dword pair sums
  packssdw              m0, m1         ; repack as words, upper half zero
  pmaddwd               m0, m3         ; 4 values -> 2 dword sums
  packssdw              m0, m1
  pmaddwd               m0, m3         ; dword 0 = total of all 16 samples
  paddw                 m0, [GLOBAL(pw_8)]  ; rounding term (low word)
  psrlw                 m0, 4
  pshuflw               m0, m0, 0x0    ; broadcast word 0 across the low half
  punpcklqdq            m0, m0         ; ... and into the high half
  mova   [dstq           ], m0         ; rows 0-3
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  lea                 dstq, [dstq+strideq*8]
  mova   [dstq           ], m0         ; rows 4-7
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_16x16(dst, stride, above, left)
;
; Fills a 16x16 block of 16-bit samples with (sum(above[0..15]) +
; sum(left[0..15]) + 16) >> 5.
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - 16 words (two mova loads)
;   left   - 16 words (two mova loads)
;
; The first reduction steps stay in 16-bit lanes (up to eight samples per
; lane), after which the words are zero-extended and the rest is summed as
; dwords.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m3, [aboveq+16]
  mova                  m2, [leftq]
  mova                  m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4  ; above/left pointers are dead now
  lea             stride3q, [strideq*3]
  mov              lines4d, 4          ; 4 iterations x 4 rows
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4         ; 8 lanes, 4 samples each
  movhlps               m2, m0         ; upper 4 lanes -> low half of m2
  paddw                 m0, m2         ; low 4 lanes, 8 samples each
  punpcklwd             m0, m1         ; zero-extend those 4 lanes to dwords
  movhlps               m2, m0
  paddd                 m0, m2         ; 2 dword partial sums
  punpckldq             m0, m1         ; spread them: d0 0 d1 0
  movhlps               m2, m0
  paddd                 m0, m2         ; dword 0 = total
  paddd                 m0, [GLOBAL(pw_16)] ; rounding term (dword constant)
  psrad                 m0, 5
  pshuflw               m0, m0, 0x0    ; broadcast the low word of the result
  punpcklqdq            m0, m0
.loop:
  mova   [dstq              ], m0
  mova   [dstq           +16], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m0
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

%if ARCH_X86_64
;------------------------------------------------------------------------------
; highbd_dc_predictor_32x32(dst, stride, above, left)
;
; Fills a 32x32 block of 16-bit samples with (sum(above[0..31]) +
; sum(left[0..31]) + 32) >> 6.  Built only for x86-64 because it uses m8.
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - 32 words (four mova loads)
;   left   - 32 words (four mova loads)
;
; Up to 16 samples are accumulated per 16-bit lane before the lanes are
; zero-extended to dwords, so 16 samples must fit in an unsigned word
; (presumably inputs are at most 12 bits wide -- TODO confirm).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  mova                  m5, [leftq]
  mova                  m6, [leftq+16]
  mova                  m7, [leftq+32]
  mova                  m8, [leftq+48]
  DEFINE_ARGS dst, stride, stride3, lines4  ; above/left pointers are dead now
  lea             stride3q, [strideq*3]
  mov              lines4d, 8          ; 8 iterations x 4 rows
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4
  paddw                 m0, m5
  paddw                 m0, m6
  paddw                 m0, m7
  paddw                 m0, m8         ; 8 lanes, 8 samples each
  movhlps               m2, m0         ; upper 4 lanes -> low half of m2
  paddw                 m0, m2         ; low 4 lanes, 16 samples each
  punpcklwd             m0, m1         ; zero-extend those 4 lanes to dwords
  movhlps               m2, m0
  paddd                 m0, m2         ; 2 dword partial sums
  punpckldq             m0, m1         ; spread them: d0 0 d1 0
  movhlps               m2, m0
  paddd                 m0, m2         ; dword 0 = total
  paddd                 m0, [GLOBAL(pw_32)] ; rounding term (dword constant)
  psrad                 m0, 6
  pshuflw               m0, m0, 0x0    ; broadcast the low word of the result
  punpcklqdq            m0, m0
.loop:
  mova [dstq               ], m0
  mova [dstq          +16  ], m0
  mova [dstq          +32  ], m0
  mova [dstq          +48  ], m0
  mova [dstq+strideq*2     ], m0
  mova [dstq+strideq*2+16  ], m0
  mova [dstq+strideq*2+32  ], m0
  mova [dstq+strideq*2+48  ], m0
  mova [dstq+strideq*4     ], m0
  mova [dstq+strideq*4+16  ], m0
  mova [dstq+strideq*4+32  ], m0
  mova [dstq+strideq*4+48  ], m0
  mova [dstq+stride3q*2    ], m0
  mova [dstq+stride3q*2 +16], m0
  mova [dstq+stride3q*2 +32], m0
  mova [dstq+stride3q*2 +48], m0
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET
%endif

;------------------------------------------------------------------------------
; highbd_v_predictor_4x4(dst, stride, above)
;
; Copies the four 16-bit samples at above[0..3] into each of the four rows of
; dst.  Rows are strideq*2 bytes apart.
; NOTE(review): this routine uses an MMX register and does not execute emms;
; presumably the caller clears the x87/MMX state -- confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  movq    [dstq          ], m0         ; row 0
  movq    [dstq+strideq*2], m0         ; row 1
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0         ; row 2
  movq    [dstq+strideq*2], m0         ; row 3
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_8x8(dst, stride, above)
;
; Copies the eight 16-bit samples at above[0..7] into each of the eight rows
; of dst.  Rows are strideq*2 bytes apart; loads and stores use mova.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3     ; the above pointer is dead from here
  lea             stride3q, [strideq*3]
  mova   [dstq           ], m0         ; rows 0-3
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  lea                 dstq, [dstq+strideq*8]
  mova   [dstq           ], m0         ; rows 4-7
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_16x16(dst, stride, above)
;
; Copies the sixteen 16-bit samples at above[0..15] into each of the sixteen
; rows of dst.  Rows are strideq*2 bytes apart; loads and stores use mova.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, nlines4  ; the above pointer is dead now
  lea             stride3q, [strideq*3]
  mov              nlines4d, 4         ; 4 iterations x 4 rows
.loop:
  mova    [dstq              ], m0
  mova    [dstq           +16], m1
  mova    [dstq+strideq*2    ], m0
  mova    [dstq+strideq*2 +16], m1
  mova    [dstq+strideq*4    ], m0
  mova    [dstq+strideq*4 +16], m1
  mova    [dstq+stride3q*2   ], m0
  mova    [dstq+stride3q*2+16], m1
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec             nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_v_predictor_32x32(dst, stride, above)
;
; Copies the thirty-two 16-bit samples at above[0..31] into each of the
; thirty-two rows of dst.  Rows are strideq*2 bytes apart; loads and stores
; use mova.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  mova                  m2, [aboveq+32]
  mova                  m3, [aboveq+48]
  DEFINE_ARGS dst, stride, stride3, nlines4  ; the above pointer is dead now
  lea             stride3q, [strideq*3]
  mov              nlines4d, 8         ; 8 iterations x 4 rows
.loop:
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq            +32], m2
  mova [dstq            +48], m3
  mova [dstq+strideq*2     ], m0
  mova [dstq+strideq*2  +16], m1
  mova [dstq+strideq*2  +32], m2
  mova [dstq+strideq*2  +48], m3
  mova [dstq+strideq*4     ], m0
  mova [dstq+strideq*4  +16], m1
  mova [dstq+strideq*4  +32], m2
  mova [dstq+strideq*4  +48], m3
  mova [dstq+stride3q*2    ], m0
  mova [dstq+stride3q*2 +16], m1
  mova [dstq+stride3q*2 +32], m2
  mova [dstq+stride3q*2 +48], m3
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec             nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_4x4(dst, stride, above, left, bps)
;
; For every row r and column c of a 4x4 block of 16-bit samples writes
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - four words; the word at above[-1] is read as well
;   left   - four words
;   bps    - shift count used to build the upper clamp (1 << bps) - 1
;
; Two rows are produced per loop iteration.  Both left samples of a row pair
; come from a single 4-byte load, so nothing beyond left[3] is read.
; NOTE(review): this routine uses MMX registers and does not execute emms;
; presumably the caller clears the x87/MMX state -- confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
  movd                  m1, [aboveq-2] ; word 0 = above[-1]
  movq                  m0, [aboveq]
  pshufw                m1, m1, 0x0    ; broadcast above[-1]
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  movd                  m3, oned
  movd                  m4, bpsd
  pshufw                m3, m3, 0x0    ; four words of 1
  DEFINE_ARGS dst, stride, line, left  ; line reuses the dead above register
  mov                lineq, -2         ; counts row pairs up to zero
  mova                  m2, m3
  psllw                 m3, m4         ; 1 << bps
  add                leftq, 8          ; point past left[3]; indexed by lineq*4
  psubw                 m3, m2 ; max possible value
  pxor                  m4, m4 ; min possible value
  psubw                 m0, m1         ; above[c] - above[-1]
.loop:
  movd                  m1, [leftq+lineq*4] ; word 0/1 = left for this row pair
  pshufw                m2, m1, 0x55   ; broadcast word 1 (second row)
  pshufw                m1, m1, 0x0    ; broadcast word 0 (first row)
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  movq    [dstq          ], m1
  movq    [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_8x8(dst, stride, above, left, bps)
;
; For every row r and column c of an 8x8 block of 16-bit samples writes
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - eight words (mova load); the word at above[-1] is read as well
;   left   - eight words
;   bps    - shift count used to build the upper clamp (1 << bps) - 1
;
; Two rows are produced per loop iteration.  Both left samples of a row pair
; come from a single 4-byte load, so nothing beyond left[7] is read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
  movd                  m1, [aboveq-2] ; word 0 = above[-1]
  mova                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m3, m3
  pxor                  m4, m4
  pinsrw                m3, oned, 0
  pinsrw                m4, bpsd, 0    ; shift count in the low word
  pshuflw               m3, m3, 0x0
  DEFINE_ARGS dst, stride, line, left  ; line reuses the dead above register
  punpcklqdq            m3, m3         ; eight words of 1
  mov                lineq, -4         ; counts row pairs up to zero
  mova                  m2, m3
  punpcklqdq            m1, m1         ; eight copies of above[-1]
  psllw                 m3, m4         ; 1 << bps
  add                leftq, 16         ; point past left[7]; indexed by lineq*4
  psubw                 m3, m2 ; max possible value
  pxor                  m4, m4 ; min possible value
  psubw                 m0, m1         ; above[c] - above[-1]
.loop:
  movd                  m1, [leftq+lineq*4] ; word 0/1 = left for this row pair
  pshuflw               m2, m1, 0x55   ; broadcast word 1 (second row)
  pshuflw               m1, m1, 0x0    ; broadcast word 0 (first row)
  punpcklqdq            m1, m1
  punpcklqdq            m2, m2
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  mova      [dstq          ], m1
  mova      [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET

%if ARCH_X86_64
;------------------------------------------------------------------------------
; highbd_tm_predictor_16x16(dst, stride, above, left, bps)
;
; For every row r and column c of a 16x16 block of 16-bit samples writes
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Built only for x86-64 because it uses m8.
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - 16 words (two mova loads); the word at above[-1] is read as well
;   left   - 16 words
;   bps    - shift count used to build the upper clamp (1 << bps) - 1
;
; Two rows are produced per loop iteration.  Both left samples of a row pair
; come from a single 4-byte load, so nothing beyond left[15] is read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
  movd                  m2, [aboveq-2] ; word 0 = above[-1]
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  pshuflw               m2, m2, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m7, m7
  pxor                  m8, m8
  pinsrw                m7, oned, 0
  pinsrw                m8, bpsd, 0    ; shift count in the low word
  pshuflw               m7, m7, 0x0
  DEFINE_ARGS dst, stride, line, left  ; line reuses the dead above register
  punpcklqdq            m7, m7         ; eight words of 1
  mov                lineq, -8         ; counts row pairs up to zero
  mova                  m5, m7
  punpcklqdq            m2, m2         ; eight copies of above[-1]
  psllw                 m7, m8         ; 1 << bps
  add                leftq, 32         ; point past left[15]; indexed by lineq*4
  psubw                 m7, m5 ; max possible value
  pxor                  m8, m8 ; min possible value
  psubw                 m0, m2         ; above[0..7]  - above[-1]
  psubw                 m1, m2         ; above[8..15] - above[-1]
.loop:
  movd                  m2, [leftq+lineq*4] ; word 0/1 = left for this row pair
  pshuflw               m3, m2, 0x55   ; broadcast word 1 (second row)
  pshuflw               m2, m2, 0x0    ; broadcast word 0 (first row)
  punpcklqdq            m2, m2
  punpcklqdq            m3, m3
  paddw                 m4, m2, m0     ; first row,  columns 0-7
  paddw                 m5, m3, m0     ; second row, columns 0-7
  paddw                 m2, m1         ; first row,  columns 8-15
  paddw                 m3, m1         ; second row, columns 8-15
  ;Clamp to the bit-depth
  pminsw                m4, m7
  pminsw                m5, m7
  pminsw                m2, m7
  pminsw                m3, m7
  pmaxsw                m4, m8
  pmaxsw                m5, m8
  pmaxsw                m2, m8
  pmaxsw                m3, m8
  ;Store the values
  mova   [dstq             ], m4
  mova   [dstq+strideq*2   ], m5
  mova   [dstq          +16], m2
  mova   [dstq+strideq*2+16], m3
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_32x32(dst, stride, above, left, bps)
;
; For every row r and column c of a 32x32 block of 16-bit samples writes
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Built only for x86-64 because it uses m8-m11.
;
;   dst    - destination; consecutive rows are strideq*2 bytes apart
;   stride - row pitch, multiplied by 2 when forming byte addresses
;   above  - 32 words (four mova loads); the word at above[-1] is read as well
;   left   - 32 words
;   bps    - shift count used to build the upper clamp (1 << bps) - 1
;
; Two rows are produced per loop iteration.  Both left samples of a row pair
; come from a single 4-byte load, so nothing beyond left[31] is read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
  movd                  m0, [aboveq-2] ; word 0 = above[-1]
  mova                  m1, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  pshuflw               m0, m0, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                 m10, m10
  pxor                 m11, m11
  pinsrw               m10, oned, 0
  pinsrw               m11, bpsd, 0    ; shift count in the low word
  pshuflw              m10, m10, 0x0
  DEFINE_ARGS dst, stride, line, left  ; line reuses the dead above register
  punpcklqdq           m10, m10        ; eight words of 1
  mov                lineq, -16        ; counts row pairs up to zero
  mova                  m5, m10
  punpcklqdq            m0, m0         ; eight copies of above[-1]
  psllw                m10, m11        ; 1 << bps
  add                leftq, 64         ; point past left[31]; indexed by lineq*4
  psubw                m10, m5 ; max possible value
  pxor                 m11, m11 ; min possible value
  psubw                 m1, m0         ; above[c] - above[-1], four vectors
  psubw                 m2, m0
  psubw                 m3, m0
  psubw                 m4, m0
.loop:
  movd                  m5, [leftq+lineq*4] ; word 0/1 = left for this row pair
  pshuflw               m6, m5, 0x55   ; broadcast word 1 (second row)
  pshuflw               m5, m5, 0x0    ; broadcast word 0 (first row)
  punpcklqdq            m5, m5
  punpcklqdq            m6, m6
  paddw                 m7, m5, m1
  paddw                 m8, m5, m2
  paddw                 m9, m5, m3
  paddw                 m5, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m5, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m5, m11
  ;Store these values
  mova   [dstq           ], m7
  mova   [dstq        +16], m8
  mova   [dstq        +32], m9
  mova   [dstq        +48], m5
  paddw                 m7, m6, m1
  paddw                 m8, m6, m2
  paddw                 m9, m6, m3
  paddw                 m6, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m6, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m6, m11
  ;Store these values
  mova   [dstq+strideq*2   ], m7
  mova   [dstq+strideq*2+16], m8
  mova   [dstq+strideq*2+32], m9
  mova   [dstq+strideq*2+48], m6
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET
%endif
477