;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;-----------------------------------------------------------------------------
;void vp9_lpf_horizontal_4_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
;
; 4-tap loop filter across a horizontal edge.  src_ptr points at the first
; row below the edge (q0); rows p3..p0 lie above it, q0..q3 below.  Eight
; pixels are filtered per iteration, 'count' iterations total.  blimit,
; limit and thresh each point at 8 replicated threshold bytes.
; Clobbers: rax, rcx, rdx, rsi, rdi (saved), mm0-mm7.
;-----------------------------------------------------------------------------
global sym(vp9_lpf_horizontal_4_mmx) PRIVATE
sym(vp9_lpf_horizontal_4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ; 16-byte aligned scratch: abs(q0 - q1)
    %define t1 [rsp + 16]   ; 16-byte aligned scratch: abs(p1 - p0)

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step (row pitch)

        movsxd      rcx, dword ptr arg(5)       ; count
.next8_h:
        mov         rdx, arg(3)                 ; limit
        movq        mm7, [rdx]
        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions: accumulate into mm1 a nonzero byte
        ; wherever ANY neighbour delta exceeds 'limit'
        movq        mm2, [rdi+2*rax]            ; q3
        movq        mm1, [rsi+2*rax]            ; q2
        movq        mm6, mm1                    ; q2
        psubusb     mm1, mm2                    ; q2-=q3 (saturating)
        psubusb     mm2, mm6                    ; q3-=q2 (saturating)
        por         mm1, mm2                    ; abs(q3-q2)
        psubusb     mm1, mm7                    ; nonzero iff abs(q3-q2) > limit

        movq        mm4, [rsi+rax]              ; q1
        movq        mm3, mm4                    ; q1
        psubusb     mm4, mm6                    ; q1-=q2
        psubusb     mm6, mm3                    ; q2-=q1
        por         mm4, mm6                    ; abs(q2-q1)
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm4, [rsi]                  ; q0
        movq        mm0, mm4                    ; q0
        psubusb     mm4, mm3                    ; q0-=q1
        psubusb     mm3, mm0                    ; q1-=q0
        por         mm4, mm3                    ; abs(q0-q1)
        movq        t0, mm4                     ; save for the hev test below
        psubusb     mm4, mm7
        por         mm1, mm4

        neg         rax                         ; negate pitch to address rows above the edge

        movq        mm2, [rsi+4*rax]            ; p3
        movq        mm4, [rdi+4*rax]            ; p2
        movq        mm5, mm4                    ; p2
        psubusb     mm4, mm2                    ; p2-=p3
        psubusb     mm2, mm5                    ; p3-=p2
        por         mm4, mm2                    ; abs(p3 - p2)
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm4, [rsi+2*rax]            ; p1
        movq        mm3, mm4                    ; p1
        psubusb     mm4, mm5                    ; p1-=p2
        psubusb     mm5, mm3                    ; p2-=p1
        por         mm4, mm5                    ; abs(p2 - p1)
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm2, mm3                    ; p1

        movq        mm4, [rsi+rax]              ; p0
        movq        mm5, mm4                    ; p0
        psubusb     mm4, mm3                    ; p0-=p1
        psubusb     mm3, mm5                    ; p1-=p0
        por         mm4, mm3                    ; abs(p1 - p0)
        movq        t1, mm4                     ; save for the hev test below
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm3, [rdi]                  ; q1
        movq        mm4, mm3                    ; q1
        psubusb     mm3, mm2                    ; q1-=p1
        psubusb     mm2, mm4                    ; p1-=q1
        por         mm2, mm3                    ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]          ; clear lsb of each byte so the
        psrlw       mm2, 1                      ; word shift yields abs(p1-q1)/2 per byte

        movq        mm6, mm5                    ; p0
        movq        mm3, [rsi]                  ; q0
        psubusb     mm5, mm3                    ; p0-=q0
        psubusb     mm3, mm6                    ; q0-=p0
        por         mm5, mm3                    ; abs(p0 - q0)
        paddusb     mm5, mm5                    ; abs(p0-q0)*2
        paddusb     mm5, mm2                    ; abs(p0 - q0)*2 + abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit
        movq        mm7, [rdx]                  ; blimit

        psubusb     mm5, mm7                    ; nonzero iff abs(p0-q0)*2 + abs(p1-q1)/2 > blimit
        por         mm1, mm5
        pxor        mm5, mm5
        pcmpeqb     mm1, mm5                    ; mm1 = filter mask: 0xff where every test passed

        ; calculate high edge variance:
        ; hev = abs(q1-q0) > thresh || abs(p1-p0) > thresh
        mov         rdx, arg(4)                 ; thresh
        movq        mm7, [rdx]
        movq        mm4, t0                     ; get abs(q1 - q0)
        psubusb     mm4, mm7                    ; nonzero iff > thresh
        movq        mm3, t1                     ; get abs(p1 - p0)
        psubusb     mm3, mm7                    ; nonzero iff > thresh
        por         mm4, mm3                    ; combine with OR (was paddb, which can
                                                ; wrap two nonzero bytes to zero, e.g.
                                                ; 0x80+0x80; por matches the vertical
                                                ; variant and implements '||' exactly)

        pcmpeqb     mm4, mm5                    ; 0xff where NOT high variance

        pcmpeqb     mm5, mm5
        pxor        mm4, mm5                    ; mm4 = hev mask: 0xff where high variance

        ; start work on filters
        movq        mm2, [rsi+2*rax]            ; p1 (rax still negative here)
        movq        mm7, [rdi]                  ; q1
        pxor        mm2, [GLOBAL(t80)]          ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]          ; q1 offset to convert to signed values
        psubsb      mm2, mm7                    ; p1 - q1
        pand        mm2, mm4                    ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [GLOBAL(t80)]          ; p0 offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]          ; q0 offset to convert to signed values
        movq        mm3, mm0                    ; q0
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0                    ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0                    ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                    ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [GLOBAL(t4)]           ; 3*(q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [GLOBAL(t3)]           ; 3*(q0 - p0) + hvm(p1 - q1) + 3

        ; (filter + 3) >> 3: widen bytes to words (value in the high byte)
        ; so psraw performs a sign-extending shift, then repack
        pxor        mm0, mm0
        pxor        mm5, mm5
        punpcklbw   mm0, mm2
        punpckhbw   mm5, mm2
        psraw       mm0, 11                     ; arithmetic >> (8 + 3)
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0                    ; (3*(q0 - p0) + hvm(p1 - q1) + 3) >> 3

        pxor        mm0, mm0                    ; 0
        movq        mm5, mm1                    ; abcdefgh
        punpcklbw   mm0, mm1                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign extended shift right by 3
        pxor        mm1, mm1                    ; 0
        punpckhbw   mm1, mm5                    ; a0b0c0d0
        psraw       mm1, 11                     ; sign extended shift right by 3
        movq        mm5, mm0                    ; save results

        packsswb    mm0, mm1                    ; (3*(q0 - p0) + hvm(p1 - q1) + 4) >> 3
        paddsw      mm5, [GLOBAL(ones)]
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                      ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                      ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1                    ; (3*(q0 - p0) + hvm(p1 - q1) + 4) >> 4
        pandn       mm4, mm5                    ; high edge variance additive (zeroed where hev)

        paddsb      mm6, mm2                    ; p0 += Filter2
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+rax], mm6              ; write back p0

        movq        mm6, [rsi+2*rax]            ; p1
        pxor        mm6, [GLOBAL(t80)]          ; reoffset
        paddsb      mm6, mm4                    ; p1 += additive
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+2*rax], mm6            ; write back p1

        psubsb      mm3, mm0                    ; q0 -= Filter1
        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        movq        [rsi], mm3                  ; write back q0

        psubsb      mm7, mm4                    ; q1 -= additive
        pxor        mm7, [GLOBAL(t80)]          ; unoffset
        movq        [rdi], mm7                  ; write back q1

        add         rsi, 8                      ; next 8 pixels along the edge
        neg         rax                         ; restore positive pitch for next iteration
        dec         rcx
        jnz         .next8_h

    add rsp, 32
    pop rsp                                     ; restore pre-ALIGN_STACK rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
225
226
;-----------------------------------------------------------------------------
;void vp9_lpf_vertical_4_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; Same 4-tap loop filter applied across a VERTICAL edge: an 8x8 block
; straddling the edge is transposed into MMX registers, the filter mask /
; hev mask / filter taps are computed exactly as in the horizontal version,
; and the four modified columns (p1 p0 q0 q1) are transposed back and
; stored.  Processes 8 rows per iteration, 'count' iterations.
; The in-register transpose comments use "RC" digit pairs: row, column.
;-----------------------------------------------------------------------------
global sym(vp9_lpf_vertical_4_mmx) PRIVATE
sym(vp9_lpf_vertical_4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 64      ; reserve 64 bytes
    %define t0   [rsp + 0]    ; aligned scratch: abs(p1 - p0)
    %define t1   [rsp + 16]   ; aligned scratch: abs(q1 - q0)
    %define srct [rsp + 32]   ; aligned scratch: transposed p1,p0,q0,q1 rows (8 bytes each)

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rax,        dword ptr arg(1) ;src_pixel_step (row pitch)

        lea         rsi,        [rsi + rax*4 - 4]   ; rsi -> row 4, 4 bytes left of the edge

        movsxd      rcx,        dword ptr arg(5) ;count
.next8_v:
        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
        add         rdi,        rax


        ;transpose: interleave rows 4..7 bytewise, then wordwise
        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
        movq        mm7,        mm6                         ; copy of row 6

        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60

        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
        movq        mm5,        mm4                         ; copy of row 4

        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40

        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46

        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40

        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40

        neg         rax                                     ; address rows 0..3 above rsi
        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20

        movq        mm1,        mm6                         ; copy of row 2
        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24

        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
        movq        mm7,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00

        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04

        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04

        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3

        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2

        movq        mm5,        mm6                         ; q2
        psubusb     mm5,        mm7                         ; q2-q3

        psubusb     mm7,        mm6                         ; q3-q2
        por         mm7,        mm5;                        ; mm7=abs(q3-q2)

        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1

        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
        movq        mm3,        mm5                         ; q1

        psubusb     mm3,        mm6                         ; q1-q2
        psubusb     mm6,        mm5                         ; q2-q1

        por         mm6,        mm3                         ; mm6=abs(q2-q1)
        lea         rdx,        srct                        ; rdx -> transposed-row scratch

        movq        [rdx+24],   mm5                         ; save q1
        movq        [rdx+16],   mm0                         ; save q0

        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00

        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00

        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00

        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2

        movq        mm2,        mm1                         ; p2
        psubusb     mm2,        mm0                         ; p2-p3

        psubusb     mm0,        mm1                         ; p3-p2
        por         mm0,        mm2                         ; mm0=abs(p3-p2)

        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1

        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+8],    mm3                         ; save p0

        movq        [rdx],      mm2                         ; save p1
        movq        mm5,        mm2                         ; mm5 = p1

        psubusb     mm2,        mm1                         ; p1-p2
        psubusb     mm1,        mm5                         ; p2-p1

        por         mm1,        mm2                         ; mm1=abs(p2-p1)
        mov         rdx,        arg(3) ;limit

        movq        mm4,        [rdx]                       ; mm4 = limit
        psubusb     mm7,        mm4                         ; nonzero iff abs(q3-q2) > limit

        psubusb     mm0,        mm4                         ; nonzero iff abs(p3-p2) > limit
        psubusb     mm1,        mm4                         ; nonzero iff abs(p2-p1) > limit

        psubusb     mm6,        mm4                         ; nonzero iff abs(q2-q1) > limit
        por         mm7,        mm6

        por         mm0,        mm1
        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit

        movq        mm1,        mm5                         ; p1

        movq        mm7,        mm3                         ; mm3=mm7=p0
        psubusb     mm7,        mm5                         ; p0 - p1

        psubusb     mm5,        mm3                         ; p1 - p0
        por         mm5,        mm7                         ; abs(p1-p0)

        movq        t0,         mm5                         ; save abs(p1-p0) for hev test
        lea         rdx,        srct

        psubusb     mm5,        mm4                         ; nonzero iff abs(p1-p0) > limit
        por         mm0,        mm5                         ; mm0=mask

        movq        mm5,        [rdx+16]                    ; mm5=q0
        movq        mm7,        [rdx+24]                    ; mm7=q1

        movq        mm6,        mm5                         ; mm6=q0
        movq        mm2,        mm7                         ; q1
        psubusb     mm5,        mm7                         ; q0-q1

        psubusb     mm7,        mm6                         ; q1-q0
        por         mm7,        mm5                         ; abs(q1-q0)

        movq        t1,         mm7                         ; save abs(q1-q0) for hev test
        psubusb     mm7,        mm4                         ; nonzero iff abs(q1-q0) > limit

        por         mm0,        mm7                         ; mask

        movq        mm5,        mm2                         ; q1
        psubusb     mm5,        mm1                         ; q1-=p1
        psubusb     mm1,        mm2                         ; p1-=q1
        por         mm5,        mm1                         ; abs(p1-q1)
        pand        mm5,        [GLOBAL(tfe)]               ; clear lsb of each byte so the
        psrlw       mm5,        1                           ; word shift yields abs(p1-q1)/2

        mov         rdx,        arg(2) ;blimit

        movq        mm4,        [rdx]                       ;blimit
        movq        mm1,        mm3                         ; mm1=mm3=p0

        movq        mm7,        mm6                         ; mm7=mm6=q0
        psubusb     mm1,        mm7                         ; p0-q0

        psubusb     mm7,        mm3                         ; q0-p0
        por         mm1,        mm7                         ; abs(q0-p0)
        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
        paddusb     mm1,        mm5                         ; abs(p0 - q0)*2 + abs(p1-q1)/2

        psubusb     mm1,        mm4                         ; nonzero iff abs(p0-q0)*2 + abs(p1-q1)/2 > blimit
        por         mm1,        mm0;                        ; mask

        pxor        mm0,        mm0
        pcmpeqb     mm1,        mm0                         ; mm1 = filter mask: 0xff where all tests passed

        ; calculate high edge variance:
        ; hev = abs(q1-q0) > thresh || abs(p1-p0) > thresh
        mov         rdx,        arg(4) ;thresh            ; get thresh
        movq        mm7,        [rdx]
        ;
        movq        mm4,        t0              ; get abs (p1 - p0)
        psubusb     mm4,        mm7             ; nonzero iff > thresh

        movq        mm3,        t1              ; get abs (q1 - q0)
        psubusb     mm3,        mm7             ; nonzero iff > thresh

        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     mm4,        mm0             ; 0xff where NOT high variance

        pcmpeqb     mm0,        mm0
        pxor        mm4,        mm0             ; mm4 = hev mask: 0xff where high variance



        ; start work on filters
        lea         rdx,        srct            ; reload transposed rows

        movq        mm2,        [rdx]           ; p1
        movq        mm7,        [rdx+24]        ; q1

        movq        mm6,        [rdx+8]         ; p0
        movq        mm0,        [rdx+16]        ; q0

        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values

        psubsb      mm2,        mm7             ; p1 - q1
        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)

        pxor        mm6,        [GLOBAL(t80)]   ; p0 offset to convert to signed values
        pxor        mm0,        [GLOBAL(t80)]   ; q0 offset to convert to signed values

        movq        mm3,        mm0             ; q0
        psubsb      mm0,        mm6             ; q0 - p0

        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand       mm1,        mm2              ; mask filter values we don't care about

        movq        mm2,        mm1
        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4

        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; (filter + 3) >> 3: widen bytes to words (value in high byte) so
        ; psraw performs a sign-extending shift, then repack
        pxor        mm0,        mm0          ;

        pxor        mm5,        mm5
        punpcklbw   mm0,        mm2         ;

        punpckhbw   mm5,        mm2         ;
        psraw       mm0,        11              ; arithmetic >> (8 + 3)

        psraw       mm5,        11
        packsswb    mm0,        mm5

        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0,        mm0           ; 0
        movq        mm5,        mm1           ; abcdefgh

        punpcklbw   mm0,        mm1           ; e0f0g0h0
        psraw       mm0,        11                ; sign extended shift right by 3

        pxor        mm1,        mm1           ; 0
        punpckhbw   mm1,        mm5           ; a0b0c0d0

        psraw       mm1,        11                ; sign extended shift right by 3
        movq        mm5,        mm0              ; save results

        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5,        [GLOBAL(ones)]

        paddsw      mm1,        [GLOBAL(ones)]
        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap

        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

        pandn       mm4,        mm5             ; high edge variance additive (zeroed where hev)

        paddsb      mm6,        mm2             ; p0+= p0 add
        pxor        mm6,        [GLOBAL(t80)]   ; unoffset

        ; mm6=p0                               ;
        movq        mm1,        [rdx]           ; p1
        pxor        mm1,        [GLOBAL(t80)]   ; reoffset

        paddsb      mm1,        mm4                 ; p1+= p1 add
        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
        ; mm6 = p0 mm1 = p1

        psubsb      mm3,        mm0                 ; q0-= q0 add
        pxor        mm3,        [GLOBAL(t80)]       ; unoffset

        ; mm3 = q0
        psubsb      mm7,        mm4                 ; q1-= q1 add
        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
        ; mm7 = q1

        ; transpose the four filtered columns back into row order and
        ; write each 4-byte group at column offset +2 (the p1 position)
        ; mm1 =    72 62 52 42 32 22 12 02
        ; mm6 =    73 63 53 43 33 23 13 03
        ; mm3 =    74 64 54 44 34 24 14 04
        ; mm7 =    75 65 55 45 35 25 15 05

        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02

        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42

        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44

        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02

        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42

        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62


        ; mm2 = 15 14 13 12 05 04 03 02
        ; mm6 = 35 34 33 32 25 24 23 22
        ; mm1 = 55 54 53 52 45 44 43 42
        ; mm5 = 75 74 73 72 65 64 63 62

        ; rax is still negative here: rsi+rax*k addresses rows above rsi

        movd        [rsi+rax*4+2], mm2          ; row 0
        psrlq       mm2,        32

        movd        [rdi+rax*4+2], mm2          ; row 1
        movd        [rsi+rax*2+2], mm6          ; row 2

        psrlq       mm6,        32
        movd        [rsi+rax+2],mm6             ; row 3

        movd        [rsi+2],    mm1             ; row 4
        psrlq       mm1,        32

        movd        [rdi+2],    mm1             ; row 5
        neg         rax                         ; restore positive pitch

        movd        [rdi+rax+2],mm5             ; row 6
        psrlq       mm5,        32

        movd        [rdi+rax*2+2], mm5          ; row 7

        lea         rsi,        [rsi+rax*8]     ; advance 8 rows
        dec         rcx
        jnz         .next8_v

    add rsp, 64
    pop rsp                                     ; restore pre-ALIGN_STACK rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
595
SECTION_RODATA
align 16
tfe:                        ; 0xfe mask: clears the lsb of each byte before a
    times 8 db 0xfe         ; word-wide shift, giving a per-byte /2
align 16
t80:                        ; 0x80 bias: XOR toggles between unsigned pixels
    times 8 db 0x80         ; and signed filter values
align 16
t1s:                        ; constant 1 per byte
    times 8 db 0x01         ; NOTE(review): not referenced in this chunk -
                            ; presumably used by other filter variants; confirm
align 16
t3:                         ; rounding constant for Filter2 ((f + 3) >> 3)
    times 8 db 0x03
align 16
t4:                         ; rounding constant for Filter1 ((f + 4) >> 3)
    times 8 db 0x04
align 16
ones:                       ; word constant 1, used when halving the 2nd tap
    times 4 dw 0x0001
align 16
s27:                        ; fixed-point taps (value in high byte); not
    times 4 dw 0x1b00       ; referenced in this chunk - likely for the
align 16                    ; wider (mb) filters; confirm against full file
s18:
    times 4 dw 0x1200
align 16
s9:
    times 4 dw 0x0900
align 16
s63:                        ; rounding constant 63; not referenced in this
    times 4 dw 0x003f       ; chunk - likely for the wider filters; confirm
627