1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;void idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   int blk_stride      - 5
; )
;
; DC-only reconstruction of two 4x4 blocks at once.
;
; For each block the first coefficient is multiplied by dequant[0], rounded
; as (x + 4) >> 3, and the result is added to every predictor pixel of that
; block.  The sums are saturated to unsigned bytes and written to dst.
;
; Layout, as used by the code below:
;   - the second block's coefficients start 32 bytes after the first
;   - each predictor / destination row is 8 bytes: 4 pixels of block 0
;     followed by 4 pixels of block 1
;   - predictor rows are blk_stride bytes apart, destination rows are
;     dst_stride bytes apart
;
; Side effect: the first 4 bytes (two coefficients) of each block in qcoeff
; are cleared.
;
; Only xmm0-xmm5 are written, so no callee-saved xmm register (xmm6 and up
; under the Microsoft x64 convention) needs to be preserved.

global sym(idct_dequant_0_2x_sse2)
sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog

        mov         rdx,            arg(1) ; dequant
        mov         rax,            arg(0) ; qcoeff

    ; word 0 = DC of block 0 / dequant[0]
        movd        xmm4,           [rax]
        movd        xmm5,           [rdx]

    ; word 4 = DC of block 1 / dequant[0] again
        pinsrw      xmm4,           [rax+32],   4
        pinsrw      xmm5,           [rdx],      4

    ; dequantize both DC values (only words 0 and 4 are used afterwards)
        pmullw      xmm4,           xmm5

    ; xmm5 (dequant) is dead after the multiply: reuse it as the zero
    ; register so that xmm7 is never touched
        pxor        xmm5,           xmm5

    ; clear coeffs
        movd        [rax],          xmm5
        movd        [rax+32],       xmm5

    ; broadcast word 0 across the low half and word 4 across the high half
    ; (a single pshufb could do this, but that instruction is SSSE3)
        pshuflw     xmm4,           xmm4,       00000000b
        pshufhw     xmm4,           xmm4,       00000000b

        mov         rax,            arg(2) ; pre
        paddw       xmm4,           [GLOBAL(fours)]

        movsxd      rcx,            dword ptr arg(5) ; blk_stride
        psraw       xmm4,           3      ; (dc + 4) >> 3

    ; load four 8-byte predictor rows
        movq        xmm0,           [rax]
        movq        xmm1,           [rax+rcx]
        movq        xmm2,           [rax+2*rcx]
        lea         rcx,            [3*rcx]
        movq        xmm3,           [rax+rcx]

    ; expand predictor bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

        mov         rax,            arg(3) ; dst
        movsxd      rdx,            dword ptr arg(4) ; dst_stride

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to 0..255)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rax],          xmm0
        movq        [rax + rdx],    xmm1

        lea         rax,            [rax + 2*rdx]

        movq        [rax],          xmm2
        movq        [rax + rdx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
99
; idct_dequant_full_2x_sse2
;
; Argument slots as read below (C types presumably match
; idct_dequant_0_2x_sse2 - confirm against the C prototype):
;   qcoeff      - 0
;   dequant     - 1
;   pre         - 2
;   dst         - 3
;   dst_stride  - 4
;   blk_stride  - 5
;
; Dequantizes and inverse-transforms two 4x4 blocks at once, adds the
; predictor and stores the saturated 8-bit result.
;
;   - 64 bytes are read from qcoeff (32 per block) with aligned loads and
;     then overwritten with zero
;   - 32 bytes of dequant factors are applied to each block
;   - every xmm register holds one row: words 0-3 belong to block 0,
;     words 4-7 to block 1
;   - predictor rows (8 bytes each) are blk_stride bytes apart, destination
;     rows are dst_stride bytes apart
;
; NOTE(review): xmm6 and xmm7 are overwritten here and not restored by any
; instruction in this routine.  Both are callee-saved under the Microsoft
; x64 convention - confirm that the prolog/epilog macros (not visible in
; this file) or the supported targets account for that.
global sym(idct_dequant_full_2x_sse2)
sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; ---- arguments -------------------------------------------------------
    mov         rax, arg(0)                     ; qcoeff
    mov         rdx, arg(1)                     ; dequant
    mov         rsi, arg(2)                     ; pre
    mov         rdi, arg(3)                     ; dst
    movsxd      rcx, dword ptr arg(5)           ; blk_stride

    pxor        xmm7, xmm7                      ; zero, used to clear qcoeff

    ; ---- load the coefficients and clear them behind us --------------------
    ; xmm0 / xmm2 = block 0 (coeffs 0-7 / 8-15)
    ; xmm1 / xmm3 = block 1 (coeffs 0-7 / 8-15)
    ; The 1 <-> 2 swap is what lets the shuffles below pair up equal rows.
    movdqa      xmm0, [rax]
    movdqa      [rax], xmm7
    movdqa      xmm2, [rax + 16]
    movdqa      [rax + 16], xmm7
    movdqa      xmm1, [rax + 32]
    movdqa      [rax + 32], xmm7
    movdqa      xmm3, [rax + 48]
    movdqa      [rax + 48], xmm7

    ; ---- dequantize (same factors for both blocks) -------------------------
    pmullw      xmm0, [rdx]
    pmullw      xmm1, [rdx]
    pmullw      xmm2, [rdx + 16]
    pmullw      xmm3, [rdx + 16]

    ; ---- regroup: xmmN = row N of block 0 | row N of block 1 ---------------
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1
    pshufd      xmm0, xmm0, 0xD8                ; row 0
    pshufd      xmm1, xmm4, 0xD8                ; row 1

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3
    pshufd      xmm2, xmm2, 0xD8                ; row 2
    pshufd      xmm3, xmm4, 0xD8                ; row 3

    ; ======================= first pass =====================================
    ; Even part.
    psubw       xmm0, xmm2                      ; b1 = row0 - row2
    paddw       xmm2, xmm2
    paddw       xmm2, xmm0                      ; a1 = row0 + row2

    ; pmulhw is a signed multiply.  x_s1sqr2 is negative as a signed word, so
    ; adding the input back yields x * sqrt(2) * sin(pi/8); for
    ; x_c1sqr2less1 the add supplies the "+1", i.e. x * sqrt(2) * cos(pi/8).
    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                      ; row1 * sqrt(2)*sin(pi/8)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm7, xmm3                      ; row3 * sqrt(2)*cos(pi/8)
    psubw       xmm7, xmm5                      ; row3*cos - row1*sin

    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; row1 * sqrt(2)*cos(pi/8)

    movdqa      xmm4, xmm3
    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; row3 * sqrt(2)*sin(pi/8)
    paddw       xmm3, xmm5                      ; d1

    ; Butterfly outputs.
    movdqa      xmm6, xmm2                      ; a1
    movdqa      xmm4, xmm0                      ; b1
    paddw       xmm2, xmm3                      ; out row 0 = a1 + d1
    psubw       xmm6, xmm3                      ; out row 3 = a1 - d1
    paddw       xmm4, xmm7                      ; out row 2 = b1 + xmm7
    psubw       xmm0, xmm7                      ; out row 1 = b1 - xmm7

    ; ---- transpose both 4x4 blocks for the second pass ---------------------
    ; in : xmm2 / xmm0 / xmm4 / xmm6 = rows 0 / 1 / 2 / 3
    ; out: xmm0 / xmm1 / xmm2 / xmm3 = columns 0 / 1 / 2 / 3
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm0                      ; block 0, rows 0+1 interleaved
    punpckhwd   xmm7, xmm0                      ; block 1, rows 0+1 interleaved

    movdqa      xmm5, xmm4
    punpcklwd   xmm4, xmm6                      ; block 0, rows 2+3 interleaved
    punpckhwd   xmm5, xmm6                      ; block 1, rows 2+3 interleaved

    movdqa      xmm1, xmm2
    punpckldq   xmm2, xmm4                      ; block 0, columns 0 and 1
    punpckhdq   xmm1, xmm4                      ; block 0, columns 2 and 3

    movdqa      xmm6, xmm7
    punpckldq   xmm7, xmm5                      ; block 1, columns 0 and 1
    punpckhdq   xmm6, xmm5                      ; block 1, columns 2 and 3

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm7
    punpckhdq   xmm5, xmm7

    movdqa      xmm7, xmm1
    punpckldq   xmm1, xmm6
    punpckhdq   xmm7, xmm6

    pshufd      xmm0, xmm2, 0xD8                ; column 0, block 0 | block 1
    pshufd      xmm2, xmm1, 0xD8                ; column 2
    pshufd      xmm1, xmm5, 0xD8                ; column 1
    pshufd      xmm3, xmm7, 0xD8                ; column 3

    ; ======================= second pass ====================================
    psubw       xmm0, xmm2                      ; b1
    paddw       xmm2, xmm2
    paddw       xmm2, xmm0                      ; a1

    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                      ; in1 * sqrt(2)*sin(pi/8)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm7, xmm3                      ; in3 * sqrt(2)*cos(pi/8)
    psubw       xmm7, xmm5                      ; in3*cos - in1*sin

    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; in1 * sqrt(2)*cos(pi/8)

    movdqa      xmm4, xmm3
    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; in3 * sqrt(2)*sin(pi/8)
    paddw       xmm3, xmm5                      ; d1

    ; Fold the rounding bias into a1 and b1 so all four outputs receive it.
    paddw       xmm0, [GLOBAL(fours)]
    paddw       xmm2, [GLOBAL(fours)]

    movdqa      xmm6, xmm2                      ; a1 + 4
    movdqa      xmm4, xmm0                      ; b1 + 4
    paddw       xmm2, xmm3                      ; a1 + d1
    psubw       xmm6, xmm3                      ; a1 - d1
    paddw       xmm4, xmm7                      ; b1 + xmm7
    psubw       xmm0, xmm7                      ; b1 - xmm7

    psraw       xmm2, 3
    psraw       xmm0, 3
    psraw       xmm4, 3
    psraw       xmm6, 3

    ; ---- transpose back so each register is one output row -----------------
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm0
    punpckhwd   xmm7, xmm0

    movdqa      xmm5, xmm4
    punpcklwd   xmm4, xmm6
    punpckhwd   xmm5, xmm6

    movdqa      xmm1, xmm2
    punpckldq   xmm2, xmm4
    punpckhdq   xmm1, xmm4

    movdqa      xmm6, xmm7
    punpckldq   xmm7, xmm5
    punpckhdq   xmm6, xmm5

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm7
    punpckhdq   xmm5, xmm7

    movdqa      xmm7, xmm1
    punpckldq   xmm1, xmm6
    punpckhdq   xmm7, xmm6

    pshufd      xmm0, xmm2, 0xD8                ; residual row 0
    pshufd      xmm2, xmm1, 0xD8                ; residual row 2
    pshufd      xmm1, xmm5, 0xD8                ; residual row 1
    pshufd      xmm3, xmm7, 0xD8                ; residual row 3

    ; ---- add the predictor -------------------------------------------------
    pxor        xmm7, xmm7                      ; zero again, for unpack/pack

    movq        xmm4, [rsi]
    movq        xmm5, [rsi + rcx]
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rsi + 2*rcx]
    lea         rcx, [rcx + 2*rcx]              ; rcx = 3 * blk_stride
    movq        xmm5, [rsi + rcx]
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

.finish:

    ; ---- saturate to unsigned bytes ----------------------------------------
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; dst_stride is only needed for the stores, so it is fetched late
    movsxd      rdx, dword ptr arg(4)           ; dst_stride

    ; ---- write four 8-byte rows --------------------------------------------
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
353
;void idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   short *dc           - 5
; )
;
; DC-only reconstruction of two 4x4 blocks whose DC values are supplied
; through the dc array: dc[0] for block 0 and dc[1] for block 1.
;
; Each DC value is rounded as (dc + 4) >> 3 and added to every predictor
; pixel of its block; the sums are saturated to unsigned bytes and written
; to dst.  Predictor rows are read 16 bytes apart, destination rows are
; dst_stride bytes apart, and each 8-byte row holds 4 pixels of block 0
; followed by 4 pixels of block 1.
;
; qcoeff and dequant are part of the signature but are not read here.
;
; Only xmm0-xmm5 are written, so no callee-saved xmm register (xmm6 and up
; under the Microsoft x64 convention) needs to be preserved.
global sym(idct_dequant_dc_0_2x_sse2)
sym(idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; the DC values come from the dc array, so qcoeff is never loaded
        mov         rsi,            arg(2) ; pre
        mov         rdi,            arg(3) ; dst
        mov         rdx,            arg(5) ; dc

    ; Zero out xmm5, for use unpacking
        pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
        movd        xmm4,           [rdx]

    ; Load up predict blocks
        movq        xmm0,           [rsi]
        movq        xmm1,           [rsi+16]
        movq        xmm2,           [rsi+32]
        movq        xmm3,           [rsi+48]

    ; Duplicate and expand dc across:
    ;   words 0-3 = dc[0], words 4-7 = dc[1]
        punpcklwd   xmm4,           xmm4
        punpckldq   xmm4,           xmm4

    ; Rounding and downshift: (dc + 4) >> 3
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to 0..255)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; Load destination stride before writing out,
    ;   doesn't need to persist
        movsxd      rdx,            dword ptr arg(4) ; dst_stride

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1

        lea         rdi,            [rdi + 2*rdx]

        movq        [rdi],          xmm2
        movq        [rdi + rdx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
438
; idct_dequant_dc_full_2x_sse2
;
; Argument slots as read below (C types presumably match
; idct_dequant_dc_0_2x_sse2 - confirm against the C prototype):
;   qcoeff      - 0
;   dequant     - 1
;   pre         - 2
;   dst         - 3
;   dst_stride  - 4
;   dc          - 5
;
; Dequantizes and inverse-transforms two 4x4 blocks at once, with the DC
; coefficient of each block replaced by a value from the dc array (dc[0]
; for block 0, dc[1] for block 1).  The predictor is then added and the
; saturated 8-bit result is stored.
;
;   - 64 bytes are read from qcoeff (32 per block) with aligned loads and
;     then overwritten with zero
;   - 32 bytes of dequant factors are applied to each block
;   - every xmm register holds one row: words 0-3 belong to block 0,
;     words 4-7 to block 1
;   - predictor rows (8 bytes each) are read 16 bytes apart, destination
;     rows are dst_stride bytes apart
;
; NOTE(review): xmm6 and xmm7 are overwritten here and not restored by any
; instruction in this routine.  Both are callee-saved under the Microsoft
; x64 convention - confirm that the prolog/epilog macros (not visible in
; this file) or the supported targets account for that.
global sym(idct_dequant_dc_full_2x_sse2)
sym(idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; ---- arguments -------------------------------------------------------
    mov         rax, arg(0)                     ; qcoeff
    mov         rdx, arg(1)                     ; dequant
    mov         rsi, arg(2)                     ; pre
    mov         rdi, arg(3)                     ; dst

    pxor        xmm7, xmm7                      ; zero, used to clear qcoeff

    ; ---- load the coefficients and clear them behind us --------------------
    ; xmm0 / xmm2 = block 0 (coeffs 0-7 / 8-15)
    ; xmm1 / xmm3 = block 1 (coeffs 0-7 / 8-15)
    ; The 1 <-> 2 swap is what lets the shuffles below pair up equal rows.
    movdqa      xmm0, [rax]
    movdqa      [rax], xmm7
    movdqa      xmm2, [rax + 16]
    movdqa      [rax + 16], xmm7
    movdqa      xmm1, [rax + 32]
    movdqa      [rax + 32], xmm7
    movdqa      xmm3, [rax + 48]
    movdqa      [rax + 48], xmm7

    ; ---- dequantize (same factors for both blocks) -------------------------
    pmullw      xmm0, [rdx]
    pmullw      xmm1, [rdx]
    pmullw      xmm2, [rdx + 16]
    pmullw      xmm3, [rdx + 16]

    ; dequant is finished with; rdx now points at the DC values
    mov         rdx, arg(5)                     ; dc

    ; ---- regroup: xmmN = row N of block 0 | row N of block 1 ---------------
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1
    pshufd      xmm0, xmm0, 0xD8                ; row 0
    pshufd      xmm1, xmm4, 0xD8                ; row 1

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3
    pshufd      xmm2, xmm2, 0xD8                ; row 2
    pshufd      xmm3, xmm4, 0xD8                ; row 3

    ; ---- overwrite the first coefficient of each block with its DC ---------
    pinsrw      xmm0, [rdx], 0                  ; block 0 <- dc[0]
    pinsrw      xmm0, [rdx + 2], 4              ; block 1 <- dc[1]

    ; ======================= first pass =====================================
    ; Even part.
    psubw       xmm0, xmm2                      ; b1 = row0 - row2
    paddw       xmm2, xmm2
    paddw       xmm2, xmm0                      ; a1 = row0 + row2

    ; pmulhw is a signed multiply.  x_s1sqr2 is negative as a signed word, so
    ; adding the input back yields x * sqrt(2) * sin(pi/8); for
    ; x_c1sqr2less1 the add supplies the "+1", i.e. x * sqrt(2) * cos(pi/8).
    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                      ; row1 * sqrt(2)*sin(pi/8)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm7, xmm3                      ; row3 * sqrt(2)*cos(pi/8)
    psubw       xmm7, xmm5                      ; row3*cos - row1*sin

    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; row1 * sqrt(2)*cos(pi/8)

    movdqa      xmm4, xmm3
    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; row3 * sqrt(2)*sin(pi/8)
    paddw       xmm3, xmm5                      ; d1

    ; Butterfly outputs.
    movdqa      xmm6, xmm2                      ; a1
    movdqa      xmm4, xmm0                      ; b1
    paddw       xmm2, xmm3                      ; out row 0 = a1 + d1
    psubw       xmm6, xmm3                      ; out row 3 = a1 - d1
    paddw       xmm4, xmm7                      ; out row 2 = b1 + xmm7
    psubw       xmm0, xmm7                      ; out row 1 = b1 - xmm7

    ; ---- transpose both 4x4 blocks for the second pass ---------------------
    ; in : xmm2 / xmm0 / xmm4 / xmm6 = rows 0 / 1 / 2 / 3
    ; out: xmm0 / xmm1 / xmm2 / xmm3 = columns 0 / 1 / 2 / 3
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm0                      ; block 0, rows 0+1 interleaved
    punpckhwd   xmm7, xmm0                      ; block 1, rows 0+1 interleaved

    movdqa      xmm5, xmm4
    punpcklwd   xmm4, xmm6                      ; block 0, rows 2+3 interleaved
    punpckhwd   xmm5, xmm6                      ; block 1, rows 2+3 interleaved

    movdqa      xmm1, xmm2
    punpckldq   xmm2, xmm4                      ; block 0, columns 0 and 1
    punpckhdq   xmm1, xmm4                      ; block 0, columns 2 and 3

    movdqa      xmm6, xmm7
    punpckldq   xmm7, xmm5                      ; block 1, columns 0 and 1
    punpckhdq   xmm6, xmm5                      ; block 1, columns 2 and 3

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm7
    punpckhdq   xmm5, xmm7

    movdqa      xmm7, xmm1
    punpckldq   xmm1, xmm6
    punpckhdq   xmm7, xmm6

    pshufd      xmm0, xmm2, 0xD8                ; column 0, block 0 | block 1
    pshufd      xmm2, xmm1, 0xD8                ; column 2
    pshufd      xmm1, xmm5, 0xD8                ; column 1
    pshufd      xmm3, xmm7, 0xD8                ; column 3

    ; ======================= second pass ====================================
    psubw       xmm0, xmm2                      ; b1
    paddw       xmm2, xmm2
    paddw       xmm2, xmm0                      ; a1

    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                      ; in1 * sqrt(2)*sin(pi/8)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm7, xmm3                      ; in3 * sqrt(2)*cos(pi/8)
    psubw       xmm7, xmm5                      ; in3*cos - in1*sin

    movdqa      xmm5, xmm1
    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                      ; in1 * sqrt(2)*cos(pi/8)

    movdqa      xmm4, xmm3
    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                      ; in3 * sqrt(2)*sin(pi/8)
    paddw       xmm3, xmm5                      ; d1

    ; Fold the rounding bias into a1 and b1 so all four outputs receive it.
    paddw       xmm0, [GLOBAL(fours)]
    paddw       xmm2, [GLOBAL(fours)]

    movdqa      xmm6, xmm2                      ; a1 + 4
    movdqa      xmm4, xmm0                      ; b1 + 4
    paddw       xmm2, xmm3                      ; a1 + d1
    psubw       xmm6, xmm3                      ; a1 - d1
    paddw       xmm4, xmm7                      ; b1 + xmm7
    psubw       xmm0, xmm7                      ; b1 - xmm7

    psraw       xmm2, 3
    psraw       xmm0, 3
    psraw       xmm4, 3
    psraw       xmm6, 3

    ; ---- transpose back so each register is one output row -----------------
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm0
    punpckhwd   xmm7, xmm0

    movdqa      xmm5, xmm4
    punpcklwd   xmm4, xmm6
    punpckhwd   xmm5, xmm6

    movdqa      xmm1, xmm2
    punpckldq   xmm2, xmm4
    punpckhdq   xmm1, xmm4

    movdqa      xmm6, xmm7
    punpckldq   xmm7, xmm5
    punpckhdq   xmm6, xmm5

    movdqa      xmm5, xmm2
    punpckldq   xmm2, xmm7
    punpckhdq   xmm5, xmm7

    movdqa      xmm7, xmm1
    punpckldq   xmm1, xmm6
    punpckhdq   xmm7, xmm6

    pshufd      xmm0, xmm2, 0xD8                ; residual row 0
    pshufd      xmm2, xmm1, 0xD8                ; residual row 2
    pshufd      xmm1, xmm5, 0xD8                ; residual row 1
    pshufd      xmm3, xmm7, 0xD8                ; residual row 3

    ; ---- add the predictor (rows 16 bytes apart) ---------------------------
    pxor        xmm7, xmm7                      ; zero again, for unpack/pack

    movq        xmm4, [rsi]
    movq        xmm5, [rsi + 16]
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rsi + 32]
    movq        xmm5, [rsi + 48]
    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

.finish:

    ; ---- saturate to unsigned bytes ----------------------------------------
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; dst_stride is only needed for the stores, so it is fetched late
    movsxd      rdx, dword ptr arg(4)           ; dst_stride

    ; ---- write four 8-byte rows --------------------------------------------
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
698
SECTION_RODATA

; Rounding bias added before the final arithmetic shift right by 3.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004

; 35468 ~= sqrt(2) * sin(pi/8) * 65536.  The value is negative when read as
; a signed word, so callers add the input back after pmulhw.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536.  Callers add the input back
; after pmulhw to restore the "+1".
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B
709