;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%macro LF_ABS 2
        ; %1 value not preserved
        ; %2 value preserved
        ; output in %1
        movdqa      scratch1, %2            ; v2

        psubusb     scratch1, %1            ; v2 - v1
        psubusb     %1, %2                  ; v1 - v2
        por         %1, scratch1            ; abs(v2 - v1)
%endmacro
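; Illustrative only: a scalar sketch of what LF_ABS computes for each byte.
; With unsigned saturating subtraction, one of the two differences is zero and
; the other is |a - b|, so OR-ing them yields the absolute difference.
;
;   /* hypothetical helper, not part of the source tree */
;   static unsigned char abs_diff(unsigned char a, unsigned char b) {
;     unsigned char d1 = a > b ? a - b : 0;   /* psubusb %1, %2       */
;     unsigned char d2 = b > a ? b - a : 0;   /* psubusb scratch1, %1 */
;     return d1 | d2;                         /* por                  */
;   }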

%macro LF_FILTER_HEV_MASK 8-9
        ; %1-%8: p3, p2, p1, p0, q0, q1, q2, q3
        ; optional %9: abs(p1 - p0) carried over from the previous block
        ; output: filter mask in %1, ~hev in %5

        LF_ABS      %1, %2                  ; abs(p3 - p2)
        LF_ABS      %2, %3                  ; abs(p2 - p1)
        pmaxub      %1, %2                  ; accumulate mask
%if %0 == 8
        movdqa      scratch2, %3            ; save p1
        LF_ABS      scratch2, %4            ; abs(p1 - p0)
%endif
        LF_ABS      %4, %5                  ; abs(p0 - q0)
        LF_ABS      %5, %6                  ; abs(q0 - q1)
%if %0 == 8
        pmaxub      %5, scratch2            ; accumulate hev
%else
        pmaxub      %5, %9                  ; accumulate hev (%9 carries abs(p1 - p0))
%endif
        pmaxub      %1, %5                  ; accumulate mask

        LF_ABS      %3, %6                  ; abs(p1 - q1)
        LF_ABS      %6, %7                  ; abs(q1 - q2)
        pmaxub      %1, %6                  ; accumulate mask
        LF_ABS      %7, %8                  ; abs(q2 - q3)
        pmaxub      %1, %7                  ; accumulate mask

        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
        pand        %3, [GLOBAL(tfe)]       ; clear lsb of each byte so the word shift stays in-byte
        psrlw       %3, 1                   ; abs(p1 - q1) / 2
        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

        psubusb     %1, [limit]
        psubusb     %4, [blimit]
        por         %1, %4
        pcmpeqb     %1, zero                ; mask

        psubusb     %5, [thresh]
        pcmpeqb     %5, zero                ; ~hev
%endmacro
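; Illustrative only: a rough scalar equivalent of the decision made above
; (paraphrasing the scalar loop-filter code elsewhere in libvpx; the names
; below are descriptive, not exact).
;
;   /* filter only if every neighbouring difference is within 'limit' and the
;      edge difference is within 'blimit'; 0xff selects the pixel column     */
;   mask = (abs(p3 - p2) <= limit) && (abs(p2 - p1) <= limit) &&
;          (abs(p1 - p0) <= limit) && (abs(q1 - q0) <= limit) &&
;          (abs(q2 - q1) <= limit) && (abs(q3 - q2) <= limit) &&
;          (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0x00;
;
;   /* high edge variance; note the macro leaves the inverted mask, ~hev */
;   hev  = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) ? 0xff : 0x00;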

%macro LF_FILTER 6
        ; %1-%4: p1, p0, q0, q1 (filtered values are written back to these)
        ; %5: mask
        ; %6: ~hev (as produced by LF_FILTER_HEV_MASK)

        movdqa      scratch2, %6            ; save ~hev

        pxor        %1, [GLOBAL(t80)]       ; ps1
        pxor        %4, [GLOBAL(t80)]       ; qs1
        movdqa      scratch1, %1
        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
        pandn       scratch2, scratch1      ; vp8_filter &= hev

        pxor        %2, [GLOBAL(t80)]       ; ps0
        pxor        %3, [GLOBAL(t80)]       ; qs0
        movdqa      scratch1, %3
        psubsb      scratch1, %2            ; qs0 - ps0
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        pand        %5, scratch2            ; &= mask

        movdqa      scratch2, %5
        paddsb      %5, [GLOBAL(t4)]        ; Filter1
        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2

        ; Filter1 >> 3 (signed byte shift emulated with a word shift plus sign fix-up)
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        %5, [GLOBAL(t1f)]
        por         %5, scratch1

        psubsb      %3, %5                  ; qs0 - Filter1
        pxor        %3, [GLOBAL(t80)]

        ; Filter2 >> 3
        movdqa      scratch1, zero
        pcmpgtb     scratch1, scratch2
        psrlw       scratch2, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        scratch2, [GLOBAL(t1f)]
        por         scratch2, scratch1

        paddsb      %2, scratch2            ; ps0 + Filter2
        pxor        %2, [GLOBAL(t80)]

        ; outer tap adjustments: (Filter1 + 1) >> 1, applied only where hev is not set
        paddsb      %5, [GLOBAL(t1)]
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 1
        pand        scratch1, [GLOBAL(t80)]
        pand        %5, [GLOBAL(t7f)]
        por         %5, scratch1
        pand        %5, %6                  ; vp8_filter &= ~hev

        psubsb      %4, %5                  ; qs1 - vp8_filter
        pxor        %4, [GLOBAL(t80)]

        paddsb      %1, %5                  ; ps1 + vp8_filter
        pxor        %1, [GLOBAL(t80)]
%endmacro
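; Illustrative only: a scalar sketch of the adjustment applied above, again
; paraphrasing the scalar loop-filter code; clamp() stands for signed-char
; saturation (paddsb/psubsb).
;
;   ps1 = p1 ^ 0x80;  ps0 = p0 ^ 0x80;        /* bias into signed range  */
;   qs0 = q0 ^ 0x80;  qs1 = q1 ^ 0x80;
;
;   f  = clamp(ps1 - qs1) & hev;              /* outer taps only if hev  */
;   f  = clamp(f + 3 * (qs0 - ps0)) & mask;   /* three saturating adds   */
;
;   Filter1 = clamp(f + 4) >> 3;              /* arithmetic byte shift,  */
;   Filter2 = clamp(f + 3) >> 3;              /* emulated above          */
;   q0 = clamp(qs0 - Filter1) ^ 0x80;
;   p0 = clamp(ps0 + Filter2) ^ 0x80;
;
;   f  = ((Filter1 + 1) >> 1) & ~hev;         /* outer tap adjustment    */
;   q1 = clamp(qs1 - f) ^ 0x80;
;   p1 = clamp(ps1 + f) ^ 0x80;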

;void vp8_loop_filter_bh_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
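; Filters the three internal horizontal edges of a 16-row luma block in place
; (the edges between rows 3|4, 7|8 and 11|12 below src_ptr). Each
; LF_FILTER_HEV_MASK / LF_FILTER pair below handles one edge, and the
; abs(q2 - q3) left over from one edge is reused as abs(p1 - p0) of the next.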
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):

%if LIBVPX_YASM_WIN64
    %define src      rcx ; src_ptr
    %define stride   rdx ; src_pixel_step
    %define blimit   r8
    %define limit    r9
    %define thresh   r10

    %define spp      rax
    %define stride3  r11
    %define stride5  r12
    %define stride7  r13

    push    rbp
    mov     rbp, rsp
    SAVE_XMM 11
    push    r12
    push    r13
    mov     thresh, arg(4)
%else
    %define src      rdi ; src_ptr
    %define stride   rsi ; src_pixel_step
    %define blimit   rdx
    %define limit    rcx
    %define thresh   r8

    %define spp      rax
    %define stride3  r9
    %define stride5  r10
    %define stride7  r11
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6
    %define zero     xmm7

    %define i0       [src]
    %define i1       [spp]
    %define i2       [src + 2 * stride]
    %define i3       [spp + 2 * stride]
    %define i4       [src + 4 * stride]
    %define i5       [spp + 4 * stride]
    %define i6       [src + 2 * stride3]
    %define i7       [spp + 2 * stride3]
    %define i8       [src + 8 * stride]
    %define i9       [spp + 8 * stride]
    %define i10      [src + 2 * stride5]
    %define i11      [spp + 2 * stride5]
    %define i12      [src + 4 * stride3]
    %define i13      [spp + 4 * stride3]
    %define i14      [src + 2 * stride7]
    %define i15      [spp + 2 * stride7]
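    ; i0..i15 address 16 consecutive rows, src + n * src_pixel_step, built
    ; from the few terms lea can scale directly: spp = src + stride and
    ; stride3/5/7 = 3/5/7 * stride (set up below).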

    ; prep work
    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]
    pxor        zero, zero

        ; load the first set into registers
        movdqa       xmm0, i0
        movdqa       xmm1, i1
        movdqa       xmm2, i2
        movdqa       xmm3, i3
        movdqa       xmm4, i4
        movdqa       xmm8, i5
        movdqa       xmm9, i6   ; q2, will contain abs(p1-p0)
        movdqa       xmm10, i7
LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

        movdqa       xmm1, i2
        movdqa       xmm2, i3
        movdqa       xmm3, i4
        movdqa       xmm8, i5
LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
        movdqa       i2, xmm1
        movdqa       i3, xmm2

; second set
        movdqa       i4, xmm3
        movdqa       i5, xmm8

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm2, i8
        movdqa       xmm4, i9
        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i11
LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm4, i8
        movdqa       xmm8, i9
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa       i6, xmm0
        movdqa       i7, xmm1

; last set
        movdqa       i8, xmm4
        movdqa       i9, xmm8

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm2, i12
        movdqa       xmm3, i13
        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i15
LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm3, i12
        movdqa       xmm8, i13
LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
        movdqa       i10, xmm0
        movdqa       i11, xmm1
        movdqa       i12, xmm3
        movdqa       i13, xmm8

%if LIBVPX_YASM_WIN64
    pop    r13
    pop    r12
    RESTORE_XMM
    pop    rbp
%endif

    ret


;void vp8_loop_filter_bv_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
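; The vertical-edge variant: the 16x16 block is transposed onto the stack
; (s0..s15 -> i0..i15), filtered with the same edge code as the horizontal
; routine above, then transposed back and written out to s0..s15.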

global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):

%if LIBVPX_YASM_WIN64
    %define src      rcx ; src_ptr
    %define stride   rdx ; src_pixel_step
    %define blimit   r8
    %define limit    r9
    %define thresh   r10

    %define spp      rax
    %define stride3  r11
    %define stride5  r12
    %define stride7  r13

    push    rbp
    mov     rbp, rsp
    SAVE_XMM 15
    push    r12
    push    r13
    mov     thresh, arg(4)
%else
    %define src      rdi
    %define stride   rsi
    %define blimit   rdx
    %define limit    rcx
    %define thresh   r8

    %define spp      rax
    %define stride3  r9
    %define stride5  r10
    %define stride7  r11
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6
    %define zero     xmm7

    %define s0       [src]
    %define s1       [spp]
    %define s2       [src + 2 * stride]
    %define s3       [spp + 2 * stride]
    %define s4       [src + 4 * stride]
    %define s5       [spp + 4 * stride]
    %define s6       [src + 2 * stride3]
    %define s7       [spp + 2 * stride3]
    %define s8       [src + 8 * stride]
    %define s9       [spp + 8 * stride]
    %define s10      [src + 2 * stride5]
    %define s11      [spp + 2 * stride5]
    %define s12      [src + 4 * stride3]
    %define s13      [spp + 4 * stride3]
    %define s14      [src + 2 * stride7]
    %define s15      [spp + 2 * stride7]

    %define i0       [rsp]
    %define i1       [rsp + 16]
    %define i2       [rsp + 32]
    %define i3       [rsp + 48]
    %define i4       [rsp + 64]
    %define i5       [rsp + 80]
    %define i6       [rsp + 96]
    %define i7       [rsp + 112]
    %define i8       [rsp + 128]
    %define i9       [rsp + 144]
    %define i10      [rsp + 160]
    %define i11      [rsp + 176]
    %define i12      [rsp + 192]
    %define i13      [rsp + 208]
    %define i14      [rsp + 224]
    %define i15      [rsp + 240]
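    ; s0..s15 are the 16 source rows; i0..i15 are 16-byte stack slots that
    ; receive the transposed block. The transpose below is the usual
    ; punpcklbw/wd/dq/qdq cascade (plus the high-half forms), done in two
    ; 8-row halves and staged through the stack.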

    ALIGN_STACK 16, rax

    ; reserve stack space
    %define      temp_storage  0 ; size is 256 (16*16)
    %define      stack_size 256
    sub          rsp, stack_size

    ; prep work
    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]

        ; 8-f
        movdqa      xmm0, s8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s9                ; 80 90
        punpckhbw   xmm1, s9                ; 88 98

        movdqa      xmm2, s10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s11 ; a0 b0
        punpckhbw   xmm3, s11 ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, s12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s13 ; c0 d0
        punpckhbw   xmm5, s13 ; c8 d8

        movdqa      xmm6, s14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s15 ; e0 f0
        punpckhbw   xmm7, s15 ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc

        ; pull the third and fourth sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 16 registers ...
        movdqa      i0, xmm0
        movdqa      i1, xmm7
        movdqa      i2, xmm4
        movdqa      i3, xmm3
        movdqa      i4, xmm1
        movdqa      i5, xmm8
        movdqa      i6, xmm2
        movdqa      i7, xmm5

        ; 0-7
        movdqa      xmm0, s0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s1 ; 00 10
        punpckhbw   xmm1, s1 ; 08 18

        movdqa      xmm2, s2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s3 ; 20 30
        punpckhbw   xmm3, s3 ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, s4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s5 ; 40 50
        punpckhbw   xmm5, s5 ; 48 58

        movdqa      xmm6, s6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s7   ; 60 70
        punpckhbw   xmm7, s7   ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

        ; pull the first two sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
        ; final combination

        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i0
        punpckhqdq  xmm6, i0

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i1
        punpckhqdq  xmm9, i1

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i2
        punpckhqdq  xmm10, i2

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i3
        punpckhqdq  xmm11, i3

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i4
        punpckhqdq  xmm12, i4

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i5
        punpckhqdq  xmm13, i5

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i6
        punpckhqdq  xmm14, i6

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i7
        punpckhqdq  xmm15, i7

        movdqa      i0, xmm0
        movdqa      i1, xmm6
        movdqa      i2, xmm7
        movdqa      i3, xmm9
        movdqa      i4, xmm4
        movdqa      i5, xmm10
        movdqa      i6, xmm3
        movdqa      i7, xmm11
        movdqa      i8, xmm1
        movdqa      i9, xmm12
        movdqa      i10, xmm8
        movdqa      i11, xmm13
        movdqa      i12, xmm2
        movdqa      i13, xmm14
        movdqa      i14, xmm5
        movdqa      i15, xmm15

; TRANSPOSED DATA AVAILABLE ON THE STACK

        movdqa      xmm12, xmm6
        movdqa      xmm13, xmm7

        pxor        zero, zero

LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

        movdqa       xmm1, i2
        movdqa       xmm2, i3
        movdqa       xmm8, i4
        movdqa       xmm9, i5
LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
        movdqa       i2, xmm1
        movdqa       i3, xmm2

; second set
        movdqa       i4, xmm8
        movdqa       i5, xmm9

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm2, i8
        movdqa       xmm4, i9
        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i11
LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm3, i8
        movdqa       xmm4, i9
LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
        movdqa       i6, xmm0
        movdqa       i7, xmm1

; last set
        movdqa       i8, xmm3
        movdqa       i9, xmm4

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm2, i12
        movdqa       xmm8, i13
        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i15
LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm4, i12
        movdqa       xmm8, i13
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa       i10, xmm0
        movdqa       i11, xmm1
        movdqa       i12, xmm4
        movdqa       i13, xmm8


; RESHUFFLE AND WRITE OUT
        ; 8-f
        movdqa      xmm0, i8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i9                ; 80 90
        punpckhbw   xmm1, i9                ; 88 98

        movdqa      xmm2, i10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i11               ; a0 b0
        punpckhbw   xmm3, i11               ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, i12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i13               ; c0 d0
        punpckhbw   xmm5, i13               ; c8 d8

        movdqa      xmm6, i14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i15               ; e0 f0
        punpckhbw   xmm7, i15               ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc

        ; pull the third and fourth sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 16 registers ...
        movdqa      i8, xmm0
        movdqa      i9, xmm7
        movdqa      i10, xmm4
        movdqa      i11, xmm3
        movdqa      i12, xmm1
        movdqa      i13, xmm8
        movdqa      i14, xmm2
        movdqa      i15, xmm5

        ; 0-7
        movdqa      xmm0, i0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i1                ; 00 10
        punpckhbw   xmm1, i1                ; 08 18

        movdqa      xmm2, i2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i3                ; 20 30
        punpckhbw   xmm3, i3                ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, i4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i5                ; 40 50
        punpckhbw   xmm5, i5                ; 48 58

        movdqa      xmm6, i6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i7                ; 60 70
        punpckhbw   xmm7, i7                ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

        ; pull the first two sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
        ; final combination

        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i8
        punpckhqdq  xmm6, i8

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i9
        punpckhqdq  xmm9, i9

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i10
        punpckhqdq  xmm10, i10

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i11
        punpckhqdq  xmm11, i11

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i12
        punpckhqdq  xmm12, i12

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i13
        punpckhqdq  xmm13, i13

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i14
        punpckhqdq  xmm14, i14

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i15
        punpckhqdq  xmm15, i15

        movdqa      s0, xmm0
        movdqa      s1, xmm6
        movdqa      s2, xmm7
        movdqa      s3, xmm9
        movdqa      s4, xmm4
        movdqa      s5, xmm10
        movdqa      s6, xmm3
        movdqa      s7, xmm11
        movdqa      s8, xmm1
        movdqa      s9, xmm12
        movdqa      s10, xmm8
        movdqa      s11, xmm13
        movdqa      s12, xmm2
        movdqa      s13, xmm14
        movdqa      s14, xmm5
        movdqa      s15, xmm15

    ; free stack space
    add          rsp, stack_size

    ; un-ALIGN_STACK
    pop          rsp

%if LIBVPX_YASM_WIN64
    pop    r13
    pop    r12
    RESTORE_XMM
    pop    rbp
%endif

    ret

SECTION_RODATA
align 16
te0:                    ; sign bits restored after the emulated signed >> 3
    times 16 db 0xe0
align 16
t7f:                    ; low 7 bits kept after the emulated signed >> 1
    times 16 db 0x7f
align 16
tfe:                    ; clears each byte's lsb ahead of the word shift
    times 16 db 0xfe
align 16
t1f:                    ; low 5 bits kept after the emulated signed >> 3
    times 16 db 0x1f
align 16
t80:                    ; sign bias (unsigned <-> signed) and >> 1 sign bit
    times 16 db 0x80
align 16
t1:                     ; rounding term for the outer tap, (Filter1 + 1) >> 1
    times 16 db 0x01
align 16
t3:                     ; Filter2 rounding constant (filter + 3)
    times 16 db 0x03
align 16
t4:                     ; Filter1 rounding constant (filter + 4)
    times 16 db 0x04
