1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only reconstruction of two 4x4 blocks that sit side by side in dst
; (bytes 0-3 and bytes 4-7 of each of 4 rows).
;
;   dc0 = qcoeff[0]  * dequant[0]
;   dc1 = qcoeff[16] * dequant[0]      (qcoeff[16] is at byte offset 32)
;   dst[row][0..3] = sat_u8(dst[row][0..3] + ((dc0 + 4) >> 3))
;   dst[row][4..7] = sat_u8(dst[row][4..7] + ((dc1 + 4) >> 3))
;
; Side effect: the 4 bytes at qcoeff and the 4 bytes at qcoeff+32 are
; zeroed.
;
; Registers written: rax, rcx, rdx, xmm0-xmm5 (plus whatever the
; prolog/epilog macros touch).
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1) ; dequant

    ; xmm4: word 0 = qcoeff[0], word 4 = qcoeff[16]
        movd        xmm4,           [rax]
        pinsrw      xmm4,           [rax+32],   4

    ; xmm5: word 0 = word 4 = dequant[0] (both blocks share the factor)
        movd        xmm5,           [rdx]
        pinsrw      xmm5,           [rdx],      4

    ; dequantize both DC values at once
        pmullw      xmm4,           xmm5

    ; xmm5 = 0 from here on: used to clear the coeffs and as the zero
    ; half for the byte<->word unpack/pack below
        pxor        xmm5,           xmm5

    ; clear the coefficients that were consumed
        movd        [rax],          xmm5
        movd        [rax+32],       xmm5

        mov         rax,            arg(2) ; dst
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx,            [rdx + rdx*2]    ; dst_stride * 3

    ; broadcast: words 0-3 = dc0, words 4-7 = dc1
        pshuflw     xmm4,           xmm4,       00000000b
        pshufhw     xmm4,           xmm4,       00000000b

    ; round and downshift: (dc + 4) >> 3
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; fetch all four predictor rows (8 bytes each) before any store
        movq        xmm0,           [rax]
        movq        xmm1,           [rax+rdx]
        movq        xmm2,           [rax+2*rdx]
        movq        xmm3,           [rax+rcx]

    ; row 0: widen to words, add dc, saturate back to bytes
        punpcklbw   xmm0,           xmm5
        paddw       xmm0,           xmm4
        packuswb    xmm0,           xmm5

    ; row 1
        punpcklbw   xmm1,           xmm5
        paddw       xmm1,           xmm4
        packuswb    xmm1,           xmm5

    ; row 2
        punpcklbw   xmm2,           xmm5
        paddw       xmm2,           xmm4
        packuswb    xmm2,           xmm5

    ; row 3
        punpcklbw   xmm3,           xmm5
        paddw       xmm3,           xmm4
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rax],          xmm0
        movq        [rax + rdx],    xmm1
        movq        [rax + rdx*2],  xmm2
        movq        [rax + rcx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
96
;-----------------------------------------------------------------------
;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Full dequant + inverse transform + add for two 4x4 blocks at once.
;
;   - Block 0 is qcoeff[0..15], block 1 is qcoeff[16..31]; both are
;     multiplied element-wise by the same dequant[0..15].
;   - The two blocks are processed in parallel: every xmm register holds
;     one 4-word row of block 0 in its low half and the matching row of
;     block 1 in its high half.
;   - The result is added to 4 rows of 8 bytes at dst (block 0 -> bytes
;     0-3, block 1 -> bytes 4-7), saturated to unsigned bytes and
;     written back.
;
; Side effect: all 64 bytes at qcoeff are zeroed.
;
; qcoeff and dequant are accessed with movdqa / SSE memory operands, so
; both must be 16-byte aligned.
;
; Registers written: rax, rcx, rdx, rdi (saved/restored), xmm0-xmm7
; (xmm6/xmm7 handled by SAVE_XMM / RESTORE_XMM).
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1)  ; dequant
        mov         rdi,            arg(2) ; dst


    ; Zero out xmm7, used below to clear the coefficient buffer
        pxor        xmm7,           xmm7


    ; note the swap of xmm1 and xmm2 relative to memory order: it pairs
    ;   the same 8 coefficients of block 0 and block 1 so that the
    ;   repack below produces sensible data
        movdqa      xmm0,           [rax]       ; block 0, coeffs 0-7
        movdqa      xmm2,           [rax+16]    ; block 0, coeffs 8-15
        movdqa      xmm1,           [rax+32]    ; block 1, coeffs 0-7
        movdqa      xmm3,           [rax+48]    ; block 1, coeffs 8-15

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same factors for both blocks)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]

    ; dequant pointer is no longer needed: rdx/rcx become the dst strides
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3

    ; repack so block 0 row x and block 1 row x are together
    ;   (pshufd 11011000b picks dwords 0,2,1,3)
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b   ; row 0 of both blocks
        pshufd      xmm1,           xmm4,       11011000b   ; row 1 of both blocks

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b   ; row 2 of both blocks
        pshufd      xmm3,           xmm4,       11011000b   ; row 3 of both blocks

    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1 term: xmm7 = ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ; xmm2 = a1 + d1  (elements 000..003 below)

        paddw       xmm4,           xmm7        ; xmm4 = b1 + xmm7 (elements 008..011 below)
        psubw       xmm0,           xmm7        ; xmm0 = b1 - xmm7 (elements 004..007 below)

        psubw       xmm6,           xmm3        ; xmm6 = a1 - d1  (elements 012..015 below)

    ; transpose for the second pass
    ;   lane comments list words high -> low; 0xx = block 0, 1xx = block 1
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass (same butterfly as the first pass, plus rounding)
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1 term: xmm7 = ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1            ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4            ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5            ; d1

    ; fold the +4 rounding term into a1 and b1 so that all four outputs
    ;   carry it before the >> 3
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ; xmm2 = a1 + d1

        paddw       xmm4,           xmm7            ; xmm4 = b1 + xmm7
        psubw       xmm0,           xmm7            ; xmm0 = b1 - xmm7

        psubw       xmm6,           xmm3            ; xmm6 = a1 - d1
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; re-zero it for unpack/pack
        pxor        xmm7,           xmm7

    ; Load up predict blocks, widen to words and add the residual
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+2*rdx]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
352
;-----------------------------------------------------------------------
;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only reconstruction of two 4x4 blocks whose DC values are supplied
; separately in dc[0] (block 0) and dc[1] (block 1).
;
;   dst[row][0..3] = sat_u8(dst[row][0..3] + ((dc[0] + 4) >> 3))
;   dst[row][4..7] = sat_u8(dst[row][4..7] + ((dc[1] + 4) >> 3))
;
; for 4 rows spaced dst_stride bytes apart.  qcoeff and dequant are part
; of the signature but are neither read nor written by this routine.
;
; Registers written: rcx, rdx, rdi (saved/restored), xmm0-xmm5.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is passed in separately, so qcoeff is not touched at all
        mov         rdi,            arg(2) ; dst
        mov         rdx,            arg(4) ; dc

    ; Zero out xmm5, for use unpacking
        pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
        movd        xmm4,           [rdx]

    ; dc pointer is consumed: rdx/rcx become the dst strides
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx, [rdx + rdx*2]               ; dst_stride * 3

    ; Load up predict blocks
        movq        xmm0,           [rdi]
        movq        xmm1,           [rdi+rdx*1]
        movq        xmm2,           [rdi+rdx*2]
        movq        xmm3,           [rdi+rcx]

    ; Duplicate and expand dc across:
    ;   words 0-3 = dc[0], words 4-7 = dc[1]
        punpcklwd   xmm4,           xmm4
        punpckldq   xmm4,           xmm4

    ; Rounding and downshift: (dc + 4) >> 3
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Full dequant + inverse transform + add for two 4x4 blocks at once,
; with the DC coefficient of each block taken from dc[] instead of from
; the dequantized qcoeff buffer.
;
;   - Block 0 is qcoeff[0..15], block 1 is qcoeff[16..31]; both are
;     multiplied element-wise by the same dequant[0..15].
;   - Coefficient 0 of block 0 is then replaced by dc[0] and coefficient
;     0 of block 1 by dc[1] (the dc values are used as-is, they are not
;     multiplied by dequant here).
;   - The result is added to 4 rows of 8 bytes at dst (block 0 -> bytes
;     0-3, block 1 -> bytes 4-7), saturated to unsigned bytes and
;     written back.
;
; Side effect: all 64 bytes at qcoeff are zeroed.
;
; qcoeff and dequant are accessed with movdqa / SSE memory operands, so
; both must be 16-byte aligned.
;
; Registers written: rax, rcx, rdx, rdi (saved/restored), xmm0-xmm7
; (xmm6/xmm7 handled by SAVE_XMM / RESTORE_XMM).
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1)  ; dequant

        mov         rdi,            arg(2) ; dst

    ; Zero out xmm7, used below to clear the coefficient buffer
        pxor        xmm7,           xmm7


    ; note the swap of xmm1 and xmm2 relative to memory order: it pairs
    ;   the same 8 coefficients of block 0 and block 1 so that the
    ;   repack below produces sensible data
        movdqa      xmm0,           [rax]       ; block 0, coeffs 0-7
        movdqa      xmm2,           [rax+16]    ; block 0, coeffs 8-15
        movdqa      xmm1,           [rax+32]    ; block 1, coeffs 0-7
        movdqa      xmm3,           [rax+48]    ; block 1, coeffs 8-15

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same factors for both blocks)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]

    ; DC component: dequant pointer is done with, rdx now points at dc[]
        mov         rdx,            arg(4)

    ; repack so block 0 row x and block 1 row x are together
    ;   (pshufd 11011000b picks dwords 0,2,1,3)
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b   ; row 0 of both blocks
        pshufd      xmm1,           xmm4,       11011000b   ; row 1 of both blocks

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b   ; row 2 of both blocks
        pshufd      xmm3,           xmm4,       11011000b   ; row 3 of both blocks

    ; insert DC component: word 0 is coefficient 0 of block 0, word 4 is
    ;   coefficient 0 of block 1
        pinsrw      xmm0,           [rdx],      0
        pinsrw      xmm0,           [rdx+2],    4

    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1 term: xmm7 = ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ; xmm2 = a1 + d1  (elements 000..003 below)

        paddw       xmm4,           xmm7        ; xmm4 = b1 + xmm7 (elements 008..011 below)
        psubw       xmm0,           xmm7        ; xmm0 = b1 - xmm7 (elements 004..007 below)

        psubw       xmm6,           xmm3        ; xmm6 = a1 - d1  (elements 012..015 below)

    ; transpose for the second pass
    ;   lane comments list words high -> low; 0xx = block 0, 1xx = block 1
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass (same butterfly as the first pass, plus rounding)
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1 term: xmm7 = ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1            ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4            ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5            ; d1

    ; fold the +4 rounding term into a1 and b1 so that all four outputs
    ;   carry it before the >> 3
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ; xmm2 = a1 + d1

        paddw       xmm4,           xmm7            ; xmm4 = b1 + xmm7
        psubw       xmm0,           xmm7            ; xmm0 = b1 - xmm7

        psubw       xmm6,           xmm3            ; xmm6 = a1 - d1
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; re-zero it for unpack/pack
        pxor        xmm7,           xmm7

    ; Load up predict blocks, widen to words and add the residual.
    ;   rdx = dst_stride and rcx = dst_stride * 3 stay live until the
    ;   final stores.
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]
        lea         rcx,            [rdx + rdx*2]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+rdx*2]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3


    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
698
SECTION_RODATA

; Each constant is one 16-byte-aligned vector of eight identical words so
; it can be used directly as an SSE2 memory operand.

; Rounding term added before the arithmetic ">> 3".
align 16
fours:
    times 8 dw 4

; 0x8A8C = 35468.  Used as  pmulhw(x, k) + x : as a signed word the
; constant is 35468 - 65536, so adding x back yields (x * 35468) >> 16.
; The code comments label the result "x * sin(pi/8) * sqrt(2)".
align 16
x_s1sqr2:
    times 8 dw 0x8A8C

; 0x4E7B = 20091.  Used as  pmulhw(x, k) + x , i.e. x + ((x * 20091) >> 16).
; The code comments label the result "x * cos(pi/8) * sqrt(2)".
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B
709