1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
SECTION_RODATA

; ---------------------------------------------------------------------------
; Constant tables shared by the predictors in this file.
;
; pb_1 is sixteen 0x01 bytes; it is used to isolate the low bit of each byte.
;
; Each sh_b<digits> table is a pshufb control vector.  Control byte i equals
; the i-th hexadecimal digit of the table name, i.e. the index of the source
; byte that is copied into output byte i.  Tables whose name has fewer than
; 16 digits are padded with zero control bytes up to 16 bytes, so those extra
; output bytes receive a copy of source byte 0.
; ---------------------------------------------------------------------------

pb_1:
  times 16 db 1

; 8-byte selections (upper 8 control bytes are zero)
sh_b01234577:
  db 0, 1, 2, 3, 4, 5, 7, 7
  times 8 db 0
sh_b12345677:
  db 1, 2, 3, 4, 5, 6, 7, 7
  times 8 db 0
sh_b23456777:
  db 2, 3, 4, 5, 6, 7, 7, 7
  times 8 db 0

; 8-byte source widened to 16 bytes by repeating byte 7
sh_b0123456777777777:
  db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777:
  db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777:
  db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7

; 16-byte shifts towards byte 0 by one / two positions, repeating byte 15
sh_b123456789abcdeff:
  db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff:
  db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15

; permutations used by the d153 predictors
sh_b32104567:
  db 3, 2, 1, 0, 4, 5, 6, 7
  times 8 db 0
sh_b8091a2b345:
  db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5
  times 6 db 0
sh_b76543210:
  db 7, 6, 5, 4, 3, 2, 1, 0
  times 8 db 0
sh_b65432108:
  db 6, 5, 4, 3, 2, 1, 0, 8
  times 8 db 0
sh_b54321089:
  db 5, 4, 3, 2, 1, 0, 8, 9
  times 8 db 0
sh_b89abcdef:
  db 8, 9, 10, 11, 12, 13, 14, 15
  times 8 db 0
sh_bfedcba9876543210:
  db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

; 4-byte selections used by d207_predictor_4x4
sh_b1233:
  db 1, 2, 3, 3
  times 12 db 0
sh_b2333:
  db 2, 3, 3, 3
  times 12 db 0
33
34SECTION .text
35
; ---------------------------------------------------------------------------
; h_predictor_4x4: every byte of output row r is a copy of left[r] (4 rows,
; 4 bytes per row).
;   dst, stride : output pointer and row pitch in bytes
;   left        : fourth argument; fetched explicitly from leftmp because
;                 cglobal is asked to load only the first two arguments
;   line        : takes the third argument's slot but is used purely as a
;                 negative row-pair counter (-2 .. 0)
; m0 is kept at zero so that pshufb broadcasts byte 0 of its destination.
; Each movd reads 4 bytes starting at the wanted left pixel; only the first
; of them survives the broadcast.
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
  pxor      m0, m0                          ; all-zero shuffle control
  movifnidn leftq, leftmp
  mov       lineq, -2                       ; two rows are written per pass
  add       leftq, 4                        ; index left[] backwards from its end
.row_pair:
  movd      m1, [leftq+lineq*2  ]           ; byte 0 = left pixel, first row
  movd      m2, [leftq+lineq*2+1]           ; byte 0 = left pixel, second row
  pshufb    m1, m0
  pshufb    m2, m0
  movd      [dstq        ], m1
  movd      [dstq+strideq], m2
  lea       dstq, [dstq+strideq*2]
  inc       lineq
  jnz       .row_pair
  REP_RET
53
; ---------------------------------------------------------------------------
; h_predictor_8x8: every byte of output row r is a copy of left[r] (8 rows,
; 8 bytes per row).
;   dst, stride : output pointer and row pitch in bytes
;   left        : fourth argument; fetched explicitly from leftmp
;   line        : negative row-pair counter (-4 .. 0)
; m0 stays zero so that pshufb broadcasts byte 0 across the MMX register.
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
  pxor      m0, m0                          ; all-zero shuffle control
  movifnidn leftq, leftmp
  mov       lineq, -4                       ; two rows are written per pass
  add       leftq, 8                        ; index left[] backwards from its end
.row_pair:
  movd      m1, [leftq+lineq*2  ]           ; byte 0 = left pixel, first row
  movd      m2, [leftq+lineq*2+1]           ; byte 0 = left pixel, second row
  pshufb    m1, m0
  pshufb    m2, m0
  movq      [dstq        ], m1
  movq      [dstq+strideq], m2
  lea       dstq, [dstq+strideq*2]
  inc       lineq
  jnz       .row_pair
  REP_RET
71
; ---------------------------------------------------------------------------
; h_predictor_16x16: every byte of output row r is a copy of left[r]
; (16 rows, 16 bytes per row).
;   dst, stride : output pointer and row pitch in bytes; rows are written
;                 with mova, i.e. as aligned 16-byte stores
;   left        : fourth argument; fetched explicitly from leftmp
;   line        : negative row-pair counter (-8 .. 0)
; m0 stays zero so that pshufb broadcasts byte 0 across the XMM register.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
  pxor      m0, m0                          ; all-zero shuffle control
  movifnidn leftq, leftmp
  mov       lineq, -8                       ; two rows are written per pass
  add       leftq, 16                       ; index left[] backwards from its end
.row_pair:
  movd      m1, [leftq+lineq*2  ]           ; byte 0 = left pixel, first row
  movd      m2, [leftq+lineq*2+1]           ; byte 0 = left pixel, second row
  pshufb    m1, m0
  pshufb    m2, m0
  mova      [dstq        ], m1
  mova      [dstq+strideq], m2
  lea       dstq, [dstq+strideq*2]
  inc       lineq
  jnz       .row_pair
  REP_RET
89
; ---------------------------------------------------------------------------
; h_predictor_32x32: every byte of output row r is a copy of left[r]
; (32 rows, 32 bytes per row written as two aligned 16-byte halves).
;   dst, stride : output pointer and row pitch in bytes
;   left        : fourth argument; fetched explicitly from leftmp
;   line        : negative row-pair counter (-16 .. 0)
; m0 stays zero so that pshufb broadcasts byte 0 across the XMM register.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
  pxor      m0, m0                          ; all-zero shuffle control
  movifnidn leftq, leftmp
  mov       lineq, -16                      ; two rows are written per pass
  add       leftq, 32                       ; index left[] backwards from its end
.row_pair:
  movd      m1, [leftq+lineq*2  ]           ; byte 0 = left pixel, first row
  movd      m2, [leftq+lineq*2+1]           ; byte 0 = left pixel, second row
  pshufb    m1, m0
  pshufb    m2, m0
  mova      [dstq           ], m1           ; first row, bytes 0-15
  mova      [dstq        +16], m1           ; first row, bytes 16-31
  mova      [dstq+strideq   ], m2           ; second row, bytes 0-15
  mova      [dstq+strideq+16], m2           ; second row, bytes 16-31
  lea       dstq, [dstq+strideq*2]
  inc       lineq
  jnz       .row_pair
  REP_RET
109
; ---------------------------------------------------------------------------
; d45_predictor_4x4
;   dst, stride : output pointer and row pitch in bytes
;   above       : pointer to the row above the block; 8 bytes a0..a7 are read
;   goffset     : scratch register handed to GET_GOT
;
; A filtered vector f[i] = (x[i] + 2*y[i] + z[i] + 2) >> 2 is built from three
; shuffles of the above row, computed as avg(avg(x,z) - ((x^z) & 1), y).
; Output row r is f[r .. r+3]: the vector is shifted down one byte per row.
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
  GET_GOT   goffsetq

  movq      m0, [aboveq]                        ; a0 a1 a2 a3 a4 a5 a6 a7
  pshufb    m1, m0, [GLOBAL(sh_b01234577)]      ; x = a0 a1 a2 a3 a4 a5 a7 a7
  pshufb    m2, m0, [GLOBAL(sh_b23456777)]      ; z = a2 a3 a4 a5 a6 a7 a7 a7
  pshufb    m0, [GLOBAL(sh_b12345677)]          ; y = a1 a2 a3 a4 a5 a6 a7 a7
  pavgb     m3, m2, m1                          ; (x + z + 1) >> 1
  pxor      m2, m1
  pand      m2, [GLOBAL(pb_1)]                  ; 1 where x + z is odd
  psubb     m3, m2                              ; (x + z) >> 1
  pavgb     m0, m3                              ; f = (x + 2y + z + 2) >> 2

  ; rows 0 and 1
  movd      [dstq        ], m0
  psrlq     m0, 8                               ; drop one byte
  movd      [dstq+strideq], m0
  lea       dstq, [dstq+strideq*2]

  ; rows 2 and 3
  psrlq     m0, 8
  movd      [dstq        ], m0
  psrlq     m0, 8
  movd      [dstq+strideq], m0

  RESTORE_GOT
  RET
136
; ---------------------------------------------------------------------------
; d45_predictor_8x8
;   dst, stride : output pointer and row pitch in bytes
;   above       : pointer to the row above the block; 8 bytes a0..a7 are read
;   goffset     : scratch register handed to GET_GOT
;
; f[i] = (x[i] + 2*y[i] + z[i] + 2) >> 2 with x = above, y = above shifted by
; one byte and z = above shifted by two bytes (byte 7 repeated at the end).
; Row 0 is f; each following row is the previous one pushed through the
; "shift by one, repeat last byte" shuffle held in m1.
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
  GET_GOT   goffsetq

  movq      m0, [aboveq]                        ; x = a0 .. a7
  mova      m1, [GLOBAL(sh_b12345677)]          ; per-row shift control
  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3
  lea       stride3q, [strideq*3]
  pshufb    m2, m0, [GLOBAL(sh_b23456777)]      ; z
  pavgb     m3, m2, m0                          ; (x + z + 1) >> 1
  pxor      m2, m0
  pand      m2, [GLOBAL(pb_1)]                  ; 1 where x + z is odd
  psubb     m3, m2                              ; (x + z) >> 1
  pshufb    m0, m1                              ; y
  pavgb     m0, m3                              ; f = (x + 2y + z + 2) >> 2

  ; rows 0-3
  movq      [dstq          ], m0
  pshufb    m0, m1
  movq      [dstq+strideq  ], m0
  pshufb    m0, m1
  movq      [dstq+strideq*2], m0
  pshufb    m0, m1
  movq      [dstq+stride3q ], m0
  pshufb    m0, m1
  lea       dstq, [dstq+strideq*4]

  ; rows 4-7
  movq      [dstq          ], m0
  pshufb    m0, m1
  movq      [dstq+strideq  ], m0
  pshufb    m0, m1
  movq      [dstq+strideq*2], m0
  pshufb    m0, m1
  movq      [dstq+stride3q ], m0

  RESTORE_GOT
  RET
175
; ---------------------------------------------------------------------------
; d45_predictor_16x16
;   dst, stride : output pointer and row pitch in bytes (rows stored with mova)
;   above       : pointer to the row above the block; 16 bytes are read (mova)
;   dst8, line, goffset : scratch registers
;
; f[i] = (x[i] + 2*y[i] + z[i] + 2) >> 2 with x = above and y, z = above
; shifted by one and two bytes (byte 15 repeated at the end).  Row r is f
; after r applications of the shift-by-one shuffle in m1.
; Because row r+8 starts with the upper half of row r, the loop writes rows
; 0-7 in full and, from the same register, the left 8 bytes of rows 8-15.
; After eight shifts the upper half of m0 no longer changes, so it is stored
; unmodified as the right 8 bytes of rows 8-15.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
  GET_GOT   goffsetq

  mova      m0, [aboveq]                            ; x = a0 .. a15
  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3, dst8, line
  lea       stride3q, [strideq*3]
  lea       dst8q, [dstq+strideq*8]                 ; row 8
  mova      m1, [GLOBAL(sh_b123456789abcdeff)]      ; per-row shift control
  pshufb    m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; z
  pavgb     m3, m2, m0                              ; (x + z + 1) >> 1
  pxor      m2, m0
  pand      m2, [GLOBAL(pb_1)]                      ; 1 where x + z is odd
  psubb     m3, m2                                  ; (x + z) >> 1
  pshufb    m0, m1                                  ; y
  pavgb     m0, m3                                  ; f = (x + 2y + z + 2) >> 2

  ; two passes of four rows: full rows 0-7 plus the left half of rows 8-15
  mov       lined, 2
.four_rows:
  mova      [dstq            ], m0
  movhps    [dst8q           ], m0
  pshufb    m0, m1
  mova      [dstq +strideq   ], m0
  movhps    [dst8q+strideq   ], m0
  pshufb    m0, m1
  mova      [dstq +strideq*2 ], m0
  movhps    [dst8q+strideq*2 ], m0
  pshufb    m0, m1
  mova      [dstq +stride3q  ], m0
  movhps    [dst8q+stride3q  ], m0
  pshufb    m0, m1
  lea       dstq, [dstq +strideq*4]
  lea       dst8q, [dst8q+strideq*4]
  dec       lined
  jnz       .four_rows

  ; dstq is now at row 8: fill the right half of rows 8-11 ...
  movhps    [dstq          +8], m0
  movhps    [dstq+strideq  +8], m0
  movhps    [dstq+strideq*2+8], m0
  movhps    [dstq+stride3q +8], m0
  lea       dstq, [dstq+strideq*4]
  ; ... and of rows 12-15
  movhps    [dstq          +8], m0
  movhps    [dstq+strideq  +8], m0
  movhps    [dstq+strideq*2+8], m0
  movhps    [dstq+stride3q +8], m0

  RESTORE_GOT
  RET
226
; ---------------------------------------------------------------------------
; d45_predictor_32x32
;   dst, stride : output pointer and row pitch in bytes (rows stored with mova)
;   above       : pointer to the row above the block; 32 bytes are read (mova)
;   dst16, line, goffset : scratch registers
;
; The 32-byte filtered row f[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2 (the
; last above byte repeated past the end) is kept as m5 (bytes 0-15) and
; m4 (bytes 16-31).  Each new row is the previous row moved down one byte:
; the low half via palignr across the two halves, the high half via the
; shuffle in m1 that repeats byte 15.
; The loop writes rows 0-15 in full and reuses each high half as the left
; half of rows 16-31; the right half of rows 16-31 is the fully shifted m4.
; m3 and m5 alternate as the low half of the current row.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
  GET_GOT   goffsetq

  mova      m0, [aboveq]                            ; a0  .. a15
  mova      m4, [aboveq+16]                         ; a16 .. a31
  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3, dst16, line
  lea       stride3q, [strideq*3]
  lea       dst16q, [dstq  +strideq*8]
  lea       dst16q, [dst16q+strideq*8]              ; row 16
  mova      m1, [GLOBAL(sh_b123456789abcdeff)]      ; per-row shift control

  ; neighbours of the low half, taken across the 16-byte boundary
  palignr   m5, m4, m0, 1                           ; a1 .. a16
  palignr   m6, m4, m0, 2                           ; a2 .. a17

  ; high half: m4 = filtered a16 .. a31
  pshufb    m2, m4, [GLOBAL(sh_b23456789abcdefff)]
  pavgb     m3, m2, m4
  pxor      m2, m4
  pand      m2, [GLOBAL(pb_1)]
  psubb     m3, m2
  pshufb    m4, m1
  pavgb     m4, m3

  ; low half: m5 = filtered a0 .. a15
  pavgb     m3, m0, m6
  pxor      m0, m6
  pand      m0, [GLOBAL(pb_1)]
  psubb     m3, m0
  pavgb     m5, m3

  ; four passes of four rows: rows 0-15 and the left half of rows 16-31
  mov       lined, 4
.four_rows:
  mova      [dstq               ], m5
  mova      [dstq            +16], m4
  mova      [dst16q             ], m4
  palignr   m3, m4, m5, 1
  pshufb    m4, m1
  mova      [dstq  +strideq     ], m3
  mova      [dstq  +strideq  +16], m4
  mova      [dst16q+strideq     ], m4
  palignr   m5, m4, m3, 1
  pshufb    m4, m1
  mova      [dstq  +strideq*2   ], m5
  mova      [dstq  +strideq*2+16], m4
  mova      [dst16q+strideq*2   ], m4
  palignr   m3, m4, m5, 1
  pshufb    m4, m1
  mova      [dstq  +stride3q    ], m3
  mova      [dstq  +stride3q +16], m4
  mova      [dst16q+stride3q    ], m4
  palignr   m5, m4, m3, 1
  pshufb    m4, m1
  lea       dstq, [dstq  +strideq*4]
  lea       dst16q, [dst16q+strideq*4]
  dec       lined
  jnz       .four_rows

  ; dstq is now at row 16: right half of rows 16-19
  mova      [dstq            +16], m4
  mova      [dstq  +strideq  +16], m4
  mova      [dstq  +strideq*2+16], m4
  mova      [dstq  +stride3q +16], m4
  lea       dstq, [dstq  +strideq*4]
  ; rows 20-23
  mova      [dstq            +16], m4
  mova      [dstq  +strideq  +16], m4
  mova      [dstq  +strideq*2+16], m4
  mova      [dstq  +stride3q +16], m4
  lea       dstq, [dstq  +strideq*4]
  ; rows 24-27
  mova      [dstq            +16], m4
  mova      [dstq  +strideq  +16], m4
  mova      [dstq  +strideq*2+16], m4
  mova      [dstq  +stride3q +16], m4
  lea       dstq, [dstq  +strideq*4]
  ; rows 28-31
  mova      [dstq            +16], m4
  mova      [dstq  +strideq  +16], m4
  mova      [dstq  +strideq*2+16], m4
  mova      [dstq  +stride3q +16], m4

  RESTORE_GOT
  RET
304
; ---------------------------------------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2  x, y, z, result
;
; Per-byte 3-tap filter:  result = (x + 2*y + z + 2) >> 2
;
; Computed without widening to 16 bits (trick from pascal):
;   result  = avg(x, z)            ; (x + z + 1) >> 1
;   result -= (x ^ z) & 1          ; remove the round-up -> (x + z) >> 1
;   result  = avg(result, y)       ; (((x + z) >> 1) + y + 1) >> 1
;
; Operands (%1..%4 = x, y, z, result):
;   x and y are only read; z is destroyed (left holding (x ^ z) & 1);
;   result is overwritten.
; The macro references pb_1 through GLOBAL(); every user in this file issues
; GET_GOT before invoking it.
; ---------------------------------------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
  pavgb     %4, %1, %3              ; result = avg(x, z)
  pxor      %3, %1                  ; z = x ^ z
  pand      %3, [GLOBAL(pb_1)]      ; z = 1 where x + z is odd
  psubb     %4, %3                  ; result = (x + z) >> 1
  pavgb     %4, %2                  ; result = (x + 2y + z + 2) >> 2
%endmacro
321
; ---------------------------------------------------------------------------
; d63_predictor_4x4
;   dst, stride : output pointer and row pitch in bytes
;   above       : pointer to the row above the block; 8 bytes a0..a7 are read
;   goffset     : scratch register handed to GET_GOT
;
; Two vectors are derived from the above row:
;   m3 = avg(a[i], a[i+1])                          (2-tap)
;   m4 = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2        (3-tap)
; Rows 0 and 1 are the first four bytes of m3 and m4; rows 2 and 3 are the
; same vectors moved down by one byte.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
  GET_GOT   goffsetq

  movq      m3, [aboveq]                        ; a0 .. a7
  pshufb    m2, m3, [GLOBAL(sh_b12345677)]      ; a1 .. a7 a7
  pshufb    m1, m3, [GLOBAL(sh_b23456777)]      ; a2 .. a7 a7 a7

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4  ; m4 = 3-tap, m1 is consumed
  pavgb     m3, m2                              ; m3 = 2-tap

  ; rows 0 and 1
  movd      [dstq        ], m3
  movd      [dstq+strideq], m4
  lea       dstq, [dstq+strideq*2]

  ; rows 2 and 3: both vectors advanced by one byte
  psrldq    m3, 1
  psrldq    m4, 1
  movd      [dstq        ], m3
  movd      [dstq+strideq], m4
  RESTORE_GOT
  RET
343
; ---------------------------------------------------------------------------
; d63_predictor_8x8
;   dst, stride : output pointer and row pitch in bytes
;   above       : pointer to the row above the block; 8 bytes a0..a7 are read
;   goffset     : scratch register handed to GET_GOT
;
; The above row is widened to 16 bytes by repeating a7, then two vectors are
; derived from it:
;   m3 = avg(a[i], a[i+1])                          (2-tap, even rows)
;   m4 = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2        (3-tap, odd rows)
; Rows 2k and 2k+1 are m3 and m4 moved down by k bytes.
;
; The widened row is built once in m3 and serves both as the x input of the
; 3-tap macro (which only reads x) and as the 2-tap input; a second,
; identical shuffle into another register is not needed.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m3, [aboveq]
  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]
  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)] ; a2 .. a7 a7 ...
  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)] ; a1 .. a7 a7 ...
  pshufb              m3, [GLOBAL(sh_b0123456777777777)]     ; a0 .. a7 a7 ...

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4  ; m4 = 3-tap; m3 is left intact
  pavgb               m3, m2                    ; m3 = 2-tap

  ; store 4 lines
  movq    [dstq        ], m3
  movq    [dstq+strideq], m4
  psrldq              m3, 1
  psrldq              m4, 1
  movq  [dstq+strideq*2], m3
  movq  [dstq+stride3q ], m4
  lea               dstq, [dstq+strideq*4]
  psrldq              m3, 1
  psrldq              m4, 1

  ; store 4 lines
  movq    [dstq        ], m3
  movq    [dstq+strideq], m4
  psrldq              m3, 1
  psrldq              m4, 1
  movq  [dstq+strideq*2], m3
  movq  [dstq+stride3q ], m4
  RESTORE_GOT
  RET
379
; ---------------------------------------------------------------------------
; d63_predictor_16x16
;   dst, stride : output pointer and row pitch in bytes (rows stored with mova)
;   above       : pointer to the row above the block; 16 bytes are read (mova)
;   line, goffset : scratch registers
;
;   m0 = avg(a[i], a[i+1])                          (2-tap, even rows)
;   m4 = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2        (3-tap, odd rows)
; with a15 repeated past the end.  After every pair of rows both vectors are
; pushed through the shift-by-one shuffle in m1.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
  GET_GOT   goffsetq

  mova      m0, [aboveq]                            ; a0 .. a15
  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3, line
  lea       stride3q, [strideq*3]
  mova      m1, [GLOBAL(sh_b123456789abcdeff)]      ; per-pair shift control
  pshufb    m3, m0, m1                              ; a1 .. a15 a15
  pshufb    m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; a2 .. a15 a15 a15

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4      ; m4 = 3-tap
  pavgb     m0, m3                                  ; m0 = 2-tap

  ; four passes, four rows each
  mov       lined, 4
.four_rows:
  mova      [dstq          ], m0
  mova      [dstq+strideq  ], m4
  pshufb    m0, m1
  pshufb    m4, m1
  mova      [dstq+strideq*2], m0
  mova      [dstq+stride3q ], m4
  pshufb    m0, m1
  pshufb    m4, m1
  lea       dstq, [dstq+strideq*4]
  dec       lined
  jnz       .four_rows
  RESTORE_GOT
  REP_RET
409
; ---------------------------------------------------------------------------
; d63_predictor_32x32
;   dst, stride : output pointer and row pitch in bytes (rows stored with mova)
;   above       : pointer to the row above the block; 32 bytes are read (mova)
;   line, goffset : scratch registers
;
; Two 32-byte vectors are kept as register pairs (low half : high half):
;   2-tap  avg(a[i], a[i+1])                     even rows  m0 : m7
;   3-tap  (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2   odd rows   m2 : m4
; with the last above byte repeated past the end.  After each pair of rows
; both vectors move down one byte: the low halves through palignr across the
; pair, the high halves through the shuffle in m1.  Inside the loop the low
; halves alternate between m0/m2 and m3/m5.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
  GET_GOT   goffsetq

  mova      m0, [aboveq]                            ; a0  .. a15
  mova      m7, [aboveq+16]                         ; a16 .. a31
  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3, line
  mova      m1, [GLOBAL(sh_b123456789abcdeff)]      ; per-pair shift control
  lea       stride3q, [strideq*3]
  pshufb    m2, m7, [GLOBAL(sh_b23456789abcdefff)]
  pshufb    m3, m7, m1

  ; high half
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4      ; m4 = 3-tap, high half
  palignr   m6, m7, m0, 1                           ; a1 .. a16
  palignr   m5, m7, m0, 2                           ; a2 .. a17
  pavgb     m7, m3                                  ; m7 = 2-tap, high half

  ; low half
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2      ; m2 = 3-tap, low half
  pavgb     m0, m6                                  ; m0 = 2-tap, low half

  ; eight passes, four rows each
  mov       lined, 8
.four_rows:
  mova      [dstq             ], m0
  mova      [dstq          +16], m7
  mova      [dstq+strideq     ], m2
  mova      [dstq+strideq  +16], m4
  palignr   m3, m7, m0, 1
  palignr   m5, m4, m2, 1
  pshufb    m7, m1
  pshufb    m4, m1

  mova      [dstq+strideq*2   ], m3
  mova      [dstq+strideq*2+16], m7
  mova      [dstq+stride3q    ], m5
  mova      [dstq+stride3q +16], m4
  palignr   m0, m7, m3, 1
  palignr   m2, m4, m5, 1
  pshufb    m7, m1
  pshufb    m4, m1
  lea       dstq, [dstq+strideq*4]
  dec       lined
  jnz       .four_rows
  RESTORE_GOT
  REP_RET
454
; ---------------------------------------------------------------------------
; d153_predictor_4x4
;   dst, stride : output pointer and row pitch in bytes
;   above       : pointer to the row above; 4 bytes are read from above-1
;                 (tl, t1, t2, t3)
;   left        : pointer to the left column; 4 bytes l1..l4 are read
;   goffset     : scratch register handed to GET_GOT
;
; Output layout (rows top to bottom):
;   A1 B1 C1 D1
;   A2 B2 A1 B1
;   A3 B3 A2 B2
;   A4 B4 A3 B3
; A* are 2-tap averages and B*, C1, D1 are 3-tap averages taken along the
; edge l4 l3 l2 l1 tl t1 t2 t3.  The rows are produced bottom-up from one
; byte string that is shifted down two bytes per row.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT   goffsetq
  movd      m0, [leftq]                         ; l1 l2 l3 l4
  movd      m1, [aboveq-1]                      ; tl t1 t2 t3
  punpckldq m0, m1                              ; l1 l2 l3 l4 tl t1 t2 t3
  pshufb    m0, [GLOBAL(sh_b32104567)]          ; l4 l3 l2 l1 tl t1 t2 t3
  psrldq    m1, m0, 1                           ; l3 l2 l1 tl t1 t2 t3
  psrldq    m2, m0, 2                           ; l2 l1 tl t1 t2 t3

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; m3 = B4 B3 B2 B1 C1 D1 ..
  pavgb     m1, m0                              ; m1 = A4 A3 A2 A1 ..

  punpcklqdq m3, m1                             ; bytes 0-7: B.. C1 D1, bytes 8-11: A4..A1

  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3
  lea       stride3q, [strideq*3]
  pshufb    m3, [GLOBAL(sh_b8091a2b345)]        ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
  movd      [dstq+stride3q ], m3                ; row 3
  psrldq    m3, 2
  movd      [dstq+strideq*2], m3                ; row 2
  psrldq    m3, 2
  movd      [dstq+strideq  ], m3                ; row 1
  psrldq    m3, 2
  movd      [dstq          ], m3                ; row 0
  RESTORE_GOT
  RET
486
; ---------------------------------------------------------------------------
; d153_predictor_8x8
;   dst, stride : output pointer and row pitch in bytes
;   above       : pointer to the row above; 8 bytes are read from above-1
;                 (tl, t1 .. t7)
;   left        : pointer to the left column; 8 bytes l1..l8 are read
;   goffset     : scratch register handed to GET_GOT
;
; Output layout (rows top to bottom):
;   A1 B1 C1 D1 E1 F1 G1 H1
;   A2 B2 A1 B1 C1 D1 E1 F1
;   A3 B3 A2 B2 A1 B1 C1 D1
;   A4 B4 A3 B3 A2 B2 A1 B1
;   A5 B5 A4 B4 A3 B3 A2 B2
;   A6 B6 A5 B5 A4 B4 A3 B3
;   A7 B7 A6 B6 A5 B5 A4 B4
;   A8 B8 A7 B7 A6 B6 A5 B5
; A* are 2-tap averages down the left edge, B* 3-tap averages down the left
; edge, and C1..H1 3-tap averages along the top edge.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT   goffsetq
  movq      m0, [leftq]                         ; bytes 0-7 : l1 .. l8
  movhps    m0, [aboveq-1]                      ; bytes 8-15: tl t1 .. t7
  pshufb    m1, m0, [GLOBAL(sh_b76543210)]      ; l8 l7 .. l1
  pshufb    m2, m0, [GLOBAL(sh_b65432108)]      ; l7 .. l1 tl
  pshufb    m3, m0, [GLOBAL(sh_b54321089)]      ; l6 .. l1 tl t1
  pshufb    m0, [GLOBAL(sh_b89abcdef)]          ; tl t1 .. t7
  psrldq    m4, m0, 1                           ; t1 .. t7
  psrldq    m5, m0, 2                           ; t2 .. t7

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; m7 = C1 .. H1
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; m0 = B8 .. B1

  pavgb     m6, m1, m2                          ; m6 = A8 .. A1
  punpcklbw m6, m0                              ; A8 B8 A7 B7 .. A1 B1

  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3
  lea       stride3q, [strideq*3]

  ; rows 3 down to 0
  movhps    [dstq+stride3q ], m6                ; A4 B4 A3 B3 A2 B2 A1 B1
  palignr   m0, m7, m6, 10                      ; A3 B3 A2 B2 A1 B1 C1 D1 ..
  movq      [dstq+strideq*2], m0
  psrldq    m0, 2                               ; A2 B2 A1 B1 C1 .. F1
  movq      [dstq+strideq  ], m0
  psrldq    m0, 2                               ; A1 B1 C1 .. H1
  movq      [dstq          ], m0
  lea       dstq, [dstq+strideq*4]

  ; rows 7 down to 4
  movq      [dstq+stride3q ], m6                ; A8 B8 A7 B7 A6 B6 A5 B5
  psrldq    m6, 2
  movq      [dstq+strideq*2], m6                ; A7 B7 .. A4 B4
  psrldq    m6, 2
  movq      [dstq+strideq  ], m6                ; A6 B6 .. A3 B3
  psrldq    m6, 2
  movq      [dstq          ], m6                ; A5 B5 .. A2 B2
  RESTORE_GOT
  RET
535
; ---------------------------------------------------------------------------
; d153_predictor_16x16
;   dst, stride : output pointer and row pitch in bytes (rows stored with mova)
;   above       : pointer to the row above; 16 bytes are read from above-1
;                 (tl, t1 .. t15) with an unaligned load
;   left        : pointer to the left column; 16 bytes are read (mova)
;   goffset     : scratch register handed to GET_GOT
;
; Output layout (g = 16): row 0 is "A1 B1 C1 D1 .. P1", and every following
; row r is "A(r+1) B(r+1)" followed by the first 14 bytes of row r-1:
;   A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
;   A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
;   A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
;   ...
;   Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
; A* are 2-tap and B* 3-tap averages down the left edge (starting at the
; top-left pixel); C1..P1 are 3-tap averages along the top edge.
;
; Register roles once the setup is done:
;   m1 = C1 .. P1 (top-edge filter)
;   m6 = A8 B8 A7 B7 .. A1 B1
;   m4 = Ag Bg Af Bf .. A9 B9
; Each row is then one palignr window into m4 : m6 : m1.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova                m0, [leftq]                       ; l1 .. l16
  movu                m7, [aboveq-1]                    ; tl t1 .. t15

  ; left-edge neighbours: prepend tl / t1,tl to the left column
  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)] ; t15 .. t1 tl
  palignr             m5, m0, m6, 15                    ; tl l1 .. l15
  palignr             m3, m0, m6, 14                    ; t1 tl l1 .. l14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B1 - Bg
  pavgb               m5, m0                            ; 2-tap avg A1 - Ag

  punpcklbw           m0, m4, m5                        ; B1 A1 .. B8 A8
  punpckhbw           m4, m5                            ; B9 A9 .. Bg Ag

  ; top-edge filter
  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1 - P1

  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)] ; A8 B8 .. A1 B1
  ; stride3 takes over the slot that held above, which has been read already
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]

  ; rows 0-7: windows into m6 : m1
  palignr             m2, m1, m6, 14
  mova  [dstq          ], m2
  palignr             m2, m1, m6, 12
  mova  [dstq+strideq  ], m2
  palignr             m2, m1, m6, 10
  mova  [dstq+strideq*2], m2
  palignr             m2, m1, m6, 8
  mova  [dstq+stride3q ], m2
  lea               dstq, [dstq+strideq*4]
  palignr             m2, m1, m6, 6
  mova  [dstq          ], m2
  palignr             m2, m1, m6, 4
  mova  [dstq+strideq  ], m2
  palignr             m2, m1, m6, 2
  mova  [dstq+strideq*2], m2
  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)] ; Ag Bg .. A9 B9
  mova  [dstq+stride3q ], m6
  lea               dstq, [dstq+strideq*4]

  ; rows 8-15: windows into m4 : m6
  palignr             m2, m6, m4, 14
  mova  [dstq          ], m2
  palignr             m2, m6, m4, 12
  mova  [dstq+strideq  ], m2
  palignr             m2, m6, m4, 10
  mova  [dstq+strideq*2], m2
  palignr             m2, m6, m4, 8
  mova  [dstq+stride3q ], m2
  lea               dstq, [dstq+strideq*4]
  palignr             m2, m6, m4, 6
  mova  [dstq          ], m2
  palignr             m2, m6, m4, 4
  mova  [dstq+strideq  ], m2
  palignr             m2, m6, m4, 2
  mova  [dstq+strideq*2], m2
  mova  [dstq+stride3q ], m4
  RESTORE_GOT
  RET
614
; ---------------------------------------------------------------------------
; d153_predictor_32x32
;   dst, stride : output pointer and row pitch in bytes (rows stored with mova)
;   above       : pointer to the row above; 32 bytes are read from above-1
;                 with unaligned loads
;   left        : pointer to the left column; 32 bytes are read (mova)
;   goffset     : scratch register handed to GET_GOT
;
; Same scheme as the 16x16 version: row 0 is "A1 B1" followed by the 3-tap
; filtered top edge, and every following row r is "A(r+1) B(r+1)" followed by
; the first 30 bytes of the previous row.  A* / B* are the 2-tap / 3-tap
; averages down the left edge.  Each 16-byte half of a row is one palignr
; window into a chain of five 16-byte vectors:
;     [A/B 32..25] [A/B 24..17] [A/B 16..9] [A/B 8..1] [top lo] [top hi]
; The first two links are only computed after rows 0-15 have been written,
; which is why left is loaded a second time.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT   goffsetq
  mova      m0, [leftq]                             ; l1 .. l16
  movu      m7, [aboveq-1]                          ; tl t1 .. t15
  movu      m1, [aboveq+15]                         ; t16 .. t31

  pshufb    m4, m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb    m6, m1, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2      ; m2 = top filter, high half

  palignr   m3, m1, m7, 1
  palignr   m5, m1, m7, 2

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1      ; m1 = top filter, low half

  pshufb    m7, [GLOBAL(sh_bfedcba9876543210)]      ; t15 .. t1 tl
  palignr   m5, m0, m7, 15                          ; tl l1 .. l15
  palignr   m3, m0, m7, 14                          ; t1 tl l1 .. l14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4      ; B1 - Bg
  pavgb     m5, m0                                  ; A1 - Ag
  punpcklbw m6, m4, m5                              ; B1 A1 .. B8 A8
  punpckhbw m4, m5                                  ; B9 A9 .. Bg Ag
  pshufb    m6, [GLOBAL(sh_bfedcba9876543210)]      ; m6 = A8 B8 .. A1 B1
  pshufb    m4, [GLOBAL(sh_bfedcba9876543210)]      ; m4 = Ag Bg .. A9 B9

  ; stride3 takes over the slot that held above, which has been read already;
  ; left keeps its register because it is read again further down
  DEFINE_ARGS dst, stride, stride3, left, line
  lea       stride3q, [strideq*3]

  ; ---- rows 0-7: left half from m6 : m1, right half from m1 : m2 ----
  palignr   m5, m2, m1, 14
  palignr   m7, m1, m6, 14
  mova      [dstq             ], m7
  mova      [dstq+16          ], m5
  palignr   m5, m2, m1, 12
  palignr   m7, m1, m6, 12
  mova      [dstq+strideq     ], m7
  mova      [dstq+strideq+16  ], m5
  palignr   m5, m2, m1, 10
  palignr   m7, m1, m6, 10
  mova      [dstq+strideq*2   ], m7
  mova      [dstq+strideq*2+16], m5
  palignr   m5, m2, m1, 8
  palignr   m7, m1, m6, 8
  mova      [dstq+stride3q    ], m7
  mova      [dstq+stride3q+16 ], m5
  lea       dstq, [dstq+strideq*4]
  palignr   m5, m2, m1, 6
  palignr   m7, m1, m6, 6
  mova      [dstq             ], m7
  mova      [dstq+16          ], m5
  palignr   m5, m2, m1, 4
  palignr   m7, m1, m6, 4
  mova      [dstq+strideq     ], m7
  mova      [dstq+strideq+16  ], m5
  palignr   m5, m2, m1, 2
  palignr   m7, m1, m6, 2
  mova      [dstq+strideq*2   ], m7
  mova      [dstq+strideq*2+16], m5
  mova      [dstq+stride3q    ], m6
  mova      [dstq+stride3q+16 ], m1
  lea       dstq, [dstq+strideq*4]

  ; ---- rows 8-15: left half from m4 : m6, right half from m6 : m1 ----
  palignr   m5, m1, m6, 14
  palignr   m3, m6, m4, 14
  mova      [dstq             ], m3
  mova      [dstq+16          ], m5
  palignr   m5, m1, m6, 12
  palignr   m3, m6, m4, 12
  mova      [dstq+strideq     ], m3
  mova      [dstq+strideq+16  ], m5
  palignr   m5, m1, m6, 10
  palignr   m3, m6, m4, 10
  mova      [dstq+strideq*2   ], m3
  mova      [dstq+strideq*2+16], m5
  palignr   m5, m1, m6, 8
  palignr   m3, m6, m4, 8
  mova      [dstq+stride3q    ], m3
  mova      [dstq+stride3q+16 ], m5
  lea       dstq, [dstq+strideq*4]
  palignr   m5, m1, m6, 6
  palignr   m3, m6, m4, 6
  mova      [dstq             ], m3
  mova      [dstq+16          ], m5
  palignr   m5, m1, m6, 4
  palignr   m3, m6, m4, 4
  mova      [dstq+strideq     ], m3
  mova      [dstq+strideq+16  ], m5
  palignr   m5, m1, m6, 2
  palignr   m3, m6, m4, 2
  mova      [dstq+strideq*2   ], m3
  mova      [dstq+strideq*2+16], m5
  mova      [dstq+stride3q    ], m4
  mova      [dstq+stride3q+16 ], m6
  lea       dstq, [dstq+strideq*4]

  ; ---- second half of the left edge (pixels 17-32) ----
  mova      m7, [leftq]                             ; l1  .. l16
  mova      m3, [leftq+16]                          ; l17 .. l32
  palignr   m5, m3, m7, 15                          ; l16 .. l31
  palignr   m0, m3, m7, 14                          ; l15 .. l30

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2      ; 3-tap B for rows 16-31
  pavgb     m5, m3                                  ; 2-tap A for rows 16-31
  punpcklbw m3, m2, m5                              ; interleaved B/A, first 8
  punpckhbw m2, m5                                  ; interleaved B/A, last 8
  pshufb    m3, [GLOBAL(sh_bfedcba9876543210)]      ; m3 = A/B 24 .. 17
  pshufb    m2, [GLOBAL(sh_bfedcba9876543210)]      ; m2 = A/B 32 .. 25

  ; ---- rows 16-23: left half from m3 : m4, right half from m4 : m6 ----
  palignr   m7, m6, m4, 14
  palignr   m0, m4, m3, 14
  mova      [dstq             ], m0
  mova      [dstq+16          ], m7
  palignr   m7, m6, m4, 12
  palignr   m0, m4, m3, 12
  mova      [dstq+strideq     ], m0
  mova      [dstq+strideq+16  ], m7
  palignr   m7, m6, m4, 10
  palignr   m0, m4, m3, 10
  mova      [dstq+strideq*2   ], m0
  mova      [dstq+strideq*2+16], m7
  palignr   m7, m6, m4, 8
  palignr   m0, m4, m3, 8
  mova      [dstq+stride3q    ], m0
  mova      [dstq+stride3q+16 ], m7
  lea       dstq, [dstq+strideq*4]
  palignr   m7, m6, m4, 6
  palignr   m0, m4, m3, 6
  mova      [dstq             ], m0
  mova      [dstq+16          ], m7
  palignr   m7, m6, m4, 4
  palignr   m0, m4, m3, 4
  mova      [dstq+strideq     ], m0
  mova      [dstq+strideq+16  ], m7
  palignr   m7, m6, m4, 2
  palignr   m0, m4, m3, 2
  mova      [dstq+strideq*2   ], m0
  mova      [dstq+strideq*2+16], m7
  mova      [dstq+stride3q    ], m3
  mova      [dstq+stride3q+16 ], m4
  lea       dstq, [dstq+strideq*4]

  ; ---- rows 24-31: left half from m2 : m3, right half from m3 : m4 ----
  palignr   m7, m4, m3, 14
  palignr   m0, m3, m2, 14
  mova      [dstq             ], m0
  mova      [dstq+16          ], m7
  palignr   m7, m4, m3, 12
  palignr   m0, m3, m2, 12
  mova      [dstq+strideq     ], m0
  mova      [dstq+strideq+16  ], m7
  palignr   m7, m4, m3, 10
  palignr   m0, m3, m2, 10
  mova      [dstq+strideq*2   ], m0
  mova      [dstq+strideq*2+16], m7
  palignr   m7, m4, m3, 8
  palignr   m0, m3, m2, 8
  mova      [dstq+stride3q    ], m0
  mova      [dstq+stride3q+16 ], m7
  lea       dstq, [dstq+strideq*4]
  palignr   m7, m4, m3, 6
  palignr   m0, m3, m2, 6
  mova      [dstq             ], m0
  mova      [dstq+16          ], m7
  palignr   m7, m4, m3, 4
  palignr   m0, m3, m2, 4
  mova      [dstq+strideq     ], m0
  mova      [dstq+strideq+16  ], m7
  palignr   m7, m4, m3, 2
  palignr   m0, m3, m2, 2
  mova      [dstq+strideq*2   ], m0
  mova      [dstq+strideq*2+16], m7
  mova      [dstq+stride3q    ], m2
  mova      [dstq+stride3q+16 ], m3

  RESTORE_GOT
  RET
791
; ---------------------------------------------------------------------------
; d207_predictor_4x4
;   dst, stride : output pointer and row pitch in bytes
;   unused      : third argument; never referenced
;   left        : pointer to the left column; 4 bytes a b c d are read
;   goffset     : scratch register handed to GET_GOT
;
; Notation: "ab" is the 2-tap average of a and b, "a2bc" the 3-tap average
; (a + 2b + c + 2) >> 2, with d repeated past the end of the column.
; The 2-tap and 3-tap results are interleaved into one string
;   ab a2bc bc b2cd cd c3d d d
; and row r is that string shifted down by 2*r bytes; the last row is all d.
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
  GET_GOT   goffsetq
  movd      m0, [leftq]                         ; a b c d
  pshufb    m3, m0, [GLOBAL(sh_b2333)]          ; c d d d
  pshufb    m1, m0, [GLOBAL(sh_b1233)]          ; b c d d

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2  ; m2 = a2bc b2cd c3d d
  pavgb     m1, m0                              ; m1 = ab bc cd d

  punpcklbw m1, m2                              ; ab a2bc bc b2cd cd c3d d d
  movd      [dstq        ], m1                  ; row 0
  psrlq     m1, 16                              ; bc b2cd cd c3d d d
  movd      [dstq+strideq], m1                  ; row 1
  lea       dstq, [dstq+strideq*2]
  psrlq     m1, 16                              ; cd c3d d d
  movd      [dstq        ], m1                  ; row 2
  pshufw    m1, m1, q1111                       ; d d d d (word 1 everywhere)
  movd      [dstq+strideq], m1                  ; row 3
  RESTORE_GOT
  RET
813
; ---------------------------------------------------------------------------
; d207_predictor_8x8
;   dst, stride : output pointer and row pitch in bytes
;   stride3     : third argument's slot, overwritten with 3 * stride
;   left        : pointer to the left column; 8 bytes a .. h are read
;   goffset     : scratch register handed to GET_GOT
;
; The left column is widened to 16 bytes by repeating h.  Its 2-tap and 3-tap
; averages are interleaved byte by byte (ab a2bc bc b2cd ..) and row r is that
; string shifted down by 2*r bytes.  After row 3 the upper four words are
; overwritten with copies of the "h h" word so that the remaining shifts pull
; in h instead of zeros.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
  GET_GOT   goffsetq
  movq      m3, [leftq]                                 ; a b c d e f g h
  lea       stride3q, [strideq*3]

  pshufb    m0, m3, [GLOBAL(sh_b0123456777777777)]      ; a .. h h ..
  pshufb    m2, m3, [GLOBAL(sh_b1234567777777777)]      ; b .. h h ..
  pshufb    m1, m3, [GLOBAL(sh_b2345677777777777)]      ; c .. h h ..

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3          ; m3 = 3-tap
  pavgb     m0, m2                                      ; m0 = 2-tap
  punpcklbw m0, m3                                      ; ab a2bc bc b2cd ..

  ; rows 0-3
  movq      [dstq          ], m0
  psrldq    m0, 2
  movq      [dstq+strideq  ], m0
  psrldq    m0, 2
  movq      [dstq+strideq*2], m0
  psrldq    m0, 2
  movq      [dstq+stride3q ], m0
  lea       dstq, [dstq+strideq*4]

  ; m0 = de d2ef ef e2fg fg f2gh gh g3h, then 8 x h after this shuffle
  pshufhw   m0, m0, q0000

  ; rows 4-7
  psrldq    m0, 2
  movq      [dstq          ], m0
  psrldq    m0, 2
  movq      [dstq+strideq  ], m0
  psrldq    m0, 2
  movq      [dstq+strideq*2], m0
  psrldq    m0, 2
  movq      [dstq+stride3q ], m0
  RESTORE_GOT
  RET
847
; ---------------------------------------------------------------------------
; d207_predictor_16x16
;   dst, stride : output pointer and row pitch in bytes (rows stored with mova)
;   stride3     : third argument's slot, overwritten with 3 * stride
;   left        : pointer to the left column; 16 bytes a .. p are read (mova)
;   goffset     : scratch register handed to GET_GOT
;
; The 2-tap and 3-tap averages of the left column (p repeated past the end)
; are interleaved byte by byte into a 32-byte string held as m1 (first half)
; and m4 (second half).  Row r is that string shifted down by 2*r bytes:
;   rows 0-7  : palignr windows into m4 : m1
;   rows 8-15 : m4 itself, moved two bytes per row by a shuffle that repeats
;               its last byte
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
  GET_GOT   goffsetq
  lea       stride3q, [strideq*3]
  mova      m0, [leftq]                             ; a b c .. o p
  pshufb    m1, m0, [GLOBAL(sh_b123456789abcdeff)]  ; b c d .. p p
  pshufb    m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; c d e .. p p p

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3      ; m3 = 3-tap
  pavgb     m1, m0                                  ; m1 = ab bc cd .. op pp

  punpckhbw m4, m1, m3                              ; interleaved, second half
  punpcklbw m1, m3                                  ; interleaved, first half

  ; rows 0-3
  mova      [dstq          ], m1
  palignr   m3, m4, m1, 2
  mova      [dstq+strideq  ], m3
  palignr   m3, m4, m1, 4
  mova      [dstq+strideq*2], m3
  palignr   m3, m4, m1, 6
  mova      [dstq+stride3q ], m3
  lea       dstq, [dstq+strideq*4]

  ; rows 4-7
  palignr   m3, m4, m1, 8
  mova      [dstq          ], m3
  palignr   m3, m4, m1, 10
  mova      [dstq+strideq  ], m3
  palignr   m3, m4, m1, 12
  mova      [dstq+strideq*2], m3
  palignr   m3, m4, m1, 14
  mova      [dstq+stride3q ], m3

  ; rows 8-15: left is no longer needed, its slot becomes the pass counter
  DEFINE_ARGS dst, stride, stride3, line
  mova      m0, [GLOBAL(sh_b23456789abcdefff)]      ; two-byte shift control
  mov       lined, 2
.four_rows:
  lea       dstq, [dstq+strideq*4]
  mova      [dstq          ], m4
  pshufb    m4, m0
  mova      [dstq+strideq  ], m4
  pshufb    m4, m0
  mova      [dstq+strideq*2], m4
  pshufb    m4, m0
  mova      [dstq+stride3q ], m4
  pshufb    m4, m0
  dec       lined
  jnz       .four_rows
  RESTORE_GOT
  REP_RET
894
;------------------------------------------------------------------------------
; d207_predictor_32x32, SSSE3
;
; Writes a 32x32 block of bytes at dst, derived only from the 32 bytes at
; left.
;
; Arguments (as named by cglobal):
;   dst     - output pointer; row r is stored at dst + r * stride
;   stride  - distance in bytes between output rows
;   stride3 - third argument register; its incoming value is never read and
;             it is overwritten with stride * 3
;             (NOTE(review): presumably the unused `above` pointer of the C
;             prototype -- confirm against the declaration)
;   left    - pointer to the 32 source bytes
;   goffset - fifth register, passed to GET_GOT
;
; All 16-byte loads and stores use mova (the aligned move), so left and every
; output row are expected to be 16-byte aligned.
;
; Notation: avg2(x, y) is pavgb, i.e. (x + y + 1) >> 1.  avg3(x, y, z) is the
; value X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 leaves in its last operand; the macro is
; defined outside this chunk and, by its name and its use here, presumably
; computes (x + 2 * y + z + 2) >> 2.
;
; With l[i] = left[i] for i < 32 and l[i] = left[31] beyond that, the code
; builds the 64-byte sequence
;   S[2i]   = avg2(l[i], l[i+1])
;   S[2i+1] = avg3(l[i], l[i+1], l[i+2])
; held as m1 = S[0..15], m6 = S[16..31], m2 = S[32..47], m7 = S[48..63].
; Row r receives S[2r .. 2r+31], with positions past the end of S read as
; left[31].
;
; Because the right half of row r (S[2r+16 .. 2r+31]) is identical to the
; left half of row r+8, each such 16-byte vector is computed once and stored
; twice: through dst (+16) and through dst8 = dst + 8 * stride.
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea           stride3q, [strideq*3]
  mova                m1, [leftq]              ;  0-15 [byte]
  mova                m2, [leftq+16]           ; 16-31 [byte]
  pshufb              m0, m2, [GLOBAL(sh_b23456789abcdefff)] ; 18-31, 31, 31
  pshufb              m4, m2, [GLOBAL(sh_b123456789abcdeff)] ; 17-31, 31

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 ; m3 = avg3 for left[16..31]
  palignr             m6, m2, m1, 1            ;  1-16 [byte]
  palignr             m5, m2, m1, 2            ;  2-17 [byte]
  pavgb               m2, m4         ; avg2 for left[16..31] (even output bytes)

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 ; m0 = avg3 for left[0..15]
  pavgb               m1, m6         ; avg2 for left[0..15] (even output bytes)

  punpckhbw           m6, m1, m0     ; interleaved output 2: S[16..31]
  punpcklbw           m1, m0         ; interleaved output 1: S[ 0..15]

  punpckhbw           m7, m2, m3     ; interleaved output 4: S[48..63]
  punpcklbw           m2, m3         ; interleaved output 3: S[32..47]

  ; output 1st 8 lines (and half of 2nd 8 lines)
  ; left is no longer needed; its register becomes dst8 = dst + 8 rows.
  DEFINE_ARGS dst, stride, stride3, dst8
  lea                  dst8q, [dstq+strideq*8]
  mova  [dstq              ], m1
  mova  [dstq           +16], m6
  mova  [dst8q             ], m6
  palignr             m0, m6, m1, 2
  palignr             m4, m2, m6, 2
  mova  [dstq +strideq     ], m0
  mova  [dstq +strideq  +16], m4
  mova  [dst8q+strideq     ], m4
  palignr             m0, m6, m1, 4
  palignr             m4, m2, m6, 4
  mova  [dstq +strideq*2   ], m0
  mova  [dstq +strideq*2+16], m4
  mova  [dst8q+strideq*2   ], m4
  palignr             m0, m6, m1, 6
  palignr             m4, m2, m6, 6
  mova  [dstq +stride3q    ], m0
  mova  [dstq +stride3q +16], m4
  mova  [dst8q+stride3q    ], m4
  lea               dstq, [dstq +strideq*4]
  lea              dst8q, [dst8q+strideq*4]
  palignr             m0, m6, m1, 8
  palignr             m4, m2, m6, 8
  mova  [dstq              ], m0
  mova  [dstq           +16], m4
  mova  [dst8q             ], m4
  palignr             m0, m6, m1, 10
  palignr             m4, m2, m6, 10
  mova  [dstq +strideq     ], m0
  mova  [dstq +strideq  +16], m4
  mova  [dst8q+strideq     ], m4
  palignr             m0, m6, m1, 12
  palignr             m4, m2, m6, 12
  mova  [dstq +strideq*2   ], m0
  mova  [dstq +strideq*2+16], m4
  mova  [dst8q+strideq*2   ], m4
  palignr             m0, m6, m1, 14
  palignr             m4, m2, m6, 14
  mova  [dstq +stride3q    ], m0
  mova  [dstq +stride3q +16], m4
  mova  [dst8q+stride3q    ], m4
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]

  ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
  ; (dst is now at row 8, dst8 at row 16; the window slides over m7:m2)
  mova  [dstq           +16], m2
  mova  [dst8q             ], m2
  palignr             m4, m7, m2, 2
  mova  [dstq +strideq  +16], m4
  mova  [dst8q+strideq     ], m4
  palignr             m4, m7, m2, 4
  mova  [dstq +strideq*2+16], m4
  mova  [dst8q+strideq*2   ], m4
  palignr             m4, m7, m2, 6
  mova  [dstq +stride3q +16], m4
  mova  [dst8q+stride3q    ], m4
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]
  palignr             m4, m7, m2, 8
  mova  [dstq           +16], m4
  mova  [dst8q             ], m4
  palignr             m4, m7, m2, 10
  mova  [dstq +strideq  +16], m4
  mova  [dst8q+strideq     ], m4
  palignr             m4, m7, m2, 12
  mova  [dstq +strideq*2+16], m4
  mova  [dst8q+strideq*2   ], m4
  palignr             m4, m7, m2, 14
  mova  [dstq +stride3q +16], m4
  mova  [dst8q+stride3q    ], m4
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]

  ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
  ; (dst is now at row 16, dst8 at row 24).  Nothing follows m7 in S, so each
  ; pshufb moves m7 down by two bytes and repeats its last byte at the top.
  mova                m0, [GLOBAL(sh_b23456789abcdefff)]
  mova  [dstq           +16], m7
  mova  [dst8q             ], m7
  pshufb              m7, m0
  mova  [dstq +strideq  +16], m7
  mova  [dst8q+strideq     ], m7
  pshufb              m7, m0
  mova  [dstq +strideq*2+16], m7
  mova  [dst8q+strideq*2   ], m7
  pshufb              m7, m0
  mova  [dstq +stride3q +16], m7
  mova  [dst8q+stride3q    ], m7
  pshufb              m7, m0
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]
  mova  [dstq           +16], m7
  mova  [dst8q             ], m7
  pshufb              m7, m0
  mova  [dstq +strideq  +16], m7
  mova  [dst8q+strideq     ], m7
  pshufb              m7, m0
  mova  [dstq +strideq*2+16], m7
  mova  [dst8q+strideq*2   ], m7
  pshufb              m7, m0
  mova  [dstq +stride3q +16], m7
  mova  [dst8q+stride3q    ], m7
  pshufb              m7, m0
  lea               dstq, [dstq+strideq*4]

  ; output last half of 4th 8 lines
  ; (dst is now at row 24).  After eight 2-byte shifts every byte of m7 is a
  ; copy of its original last byte, so the right halves of rows 24-31 are all
  ; the same constant vector.
  mova  [dstq           +16], m7
  mova  [dstq +strideq  +16], m7
  mova  [dstq +strideq*2+16], m7
  mova  [dstq +stride3q +16], m7
  lea               dstq, [dstq+strideq*4]
  mova  [dstq           +16], m7
  mova  [dstq +strideq  +16], m7
  mova  [dstq +strideq*2+16], m7
  mova  [dstq +stride3q +16], m7

  ; done!
  RESTORE_GOT
  RET
1037