1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14
;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
;
; Normal (inner) VP8 loop filter applied across a horizontal edge.
; src_ptr points at the first q0 row (just below the edge); rows above the
; edge are reached by negating the pitch.  Each loop iteration filters an
; 8-pixel-wide segment; 'count' is the number of such segments.
; blimit/limit/thresh each point to 8 replicated threshold bytes.
global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];  scratch: abs(q0-q1)
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];  scratch: abs(p1-p0)

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        movsxd      rcx, dword ptr arg(5) ;count
.next8_h:
        mov         rdx, arg(3) ;limit
        movq        mm7, [rdx]            ; mm7 = limit, replicated in 8 bytes
        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions
        ; mm1 accumulates the filter mask: a byte becomes nonzero as soon as any
        ; pairwise difference exceeds 'limit' (psubusb saturates to 0 otherwise).
        movq        mm2, [rdi+2*rax]      ; q3
        movq        mm1, [rsi+2*rax]      ; q2
        movq        mm6, mm1              ; q2
        psubusb     mm1, mm2              ; q2-=q3
        psubusb     mm2, mm6              ; q3-=q2
        por         mm1, mm2              ; abs(q3-q2)
        psubusb     mm1, mm7              ; nonzero where abs(q3-q2) > limit


        movq        mm4, [rsi+rax]        ; q1
        movq        mm3, mm4              ; q1
        psubusb     mm4, mm6              ; q1-=q2
        psubusb     mm6, mm3              ; q2-=q1
        por         mm4, mm6              ; abs(q2-q1)

        psubusb     mm4, mm7
        por        mm1, mm4               ; fold abs(q2-q1) > limit into mask

        movq        mm4, [rsi]            ; q0
        movq        mm0, mm4              ; q0
        psubusb     mm4, mm3              ; q0-=q1
        psubusb     mm3, mm0              ; q1-=q0
        por         mm4, mm3              ; abs(q0-q1)
        movq        t0, mm4               ; save to t0 (reused later for hev test)
        psubusb     mm4, mm7
        por        mm1, mm4               ; fold abs(q0-q1) > limit into mask


        neg         rax                   ; negate pitch to deal with above border

        movq        mm2, [rsi+4*rax]      ; p3
        movq        mm4, [rdi+4*rax]      ; p2
        movq        mm5, mm4              ; p2
        psubusb     mm4, mm2              ; p2-=p3
        psubusb     mm2, mm5              ; p3-=p2
        por         mm4, mm2              ; abs(p3 - p2)
        psubusb     mm4, mm7
        por        mm1, mm4               ; fold abs(p3-p2) > limit into mask


        movq        mm4, [rsi+2*rax]      ; p1
        movq        mm3, mm4              ; p1
        psubusb     mm4, mm5              ; p1-=p2
        psubusb     mm5, mm3              ; p2-=p1
        por         mm4, mm5              ; abs(p2 - p1)
        psubusb     mm4, mm7
        por        mm1, mm4               ; fold abs(p2-p1) > limit into mask

        movq        mm2, mm3              ; p1

        movq        mm4, [rsi+rax]        ; p0
        movq        mm5, mm4              ; p0
        psubusb     mm4, mm3              ; p0-=p1
        psubusb     mm3, mm5              ; p1-=p0
        por         mm4, mm3              ; abs(p1 - p0)
        movq        t1, mm4               ; save to t1 (reused later for hev test)
        psubusb     mm4, mm7
        por        mm1, mm4               ; fold abs(p1-p0) > limit into mask

        movq        mm3, [rdi]            ; q1
        movq        mm4, mm3              ; q1
        psubusb     mm3, mm2              ; q1-=p1
        psubusb     mm2, mm4              ; p1-=q1
        por         mm2, mm3              ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
        psrlw       mm2, 1                ; abs(p1-q1)/2

        movq        mm6, mm5              ; p0
        movq        mm3, [rsi]            ; q0
        psubusb     mm5, mm3              ; p0-=q0
        psubusb     mm3, mm6              ; q0-=p0
        por         mm5, mm3              ; abs(p0 - q0)
        paddusb     mm5, mm5              ; abs(p0-q0)*2
        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2) ;blimit           ; get blimit
        movq        mm7, [rdx]            ; blimit

        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1,    mm5
        pxor        mm5,    mm5
        pcmpeqb     mm1,    mm5           ; mask mm1: 0xFF where every limit test passed

        ; calculate high edge variance
        mov         rdx, arg(4) ;thresh           ; get thresh
        movq        mm7, [rdx]            ;
        movq        mm4, t0               ; get abs (q1 - q0)
        psubusb     mm4, mm7
        movq        mm3, t1               ; get abs (p1 - p0)
        psubusb     mm3, mm7
        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
                                          ; NOTE(review): the vertical variant combines the two
                                          ; tests with por; paddb could in principle wrap a byte
                                          ; back to zero -- confirm operand ranges preclude that.

        pcmpeqb     mm4,        mm5       ; 0xFF where variance is low

        pcmpeqb     mm5,        mm5       ; all ones
        pxor        mm4,        mm5       ; invert: mm4 = hev (high edge variance) mask


        ; start work on filters
        movq        mm2, [rsi+2*rax]      ; p1
        movq        mm7, [rdi]            ; q1
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1
        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                  ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; arithmetic >>3 of each signed byte: the byte is placed in the high
        ; half of a word (low byte zero) and shifted right by 8+3 = 11.
        pxor        mm0, mm0             ;
        pxor        mm5, mm5
        punpcklbw   mm0, mm2            ; low four bytes -> high byte of words
        punpckhbw   mm5, mm2            ; high four bytes -> high byte of words
        psraw       mm0, 11             ; sign extended shift right by 3
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0, mm0              ; 0
        movq        mm5, mm1              ; abcdefgh
        punpcklbw   mm0, mm1              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        pxor        mm1, mm1              ; 0
        punpckhbw   mm1, mm5              ; a0b0c0d0
        psraw       mm1, 11               ; sign extended shift right by 3
        movq        mm5, mm0              ; save results

        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [GLOBAL(ones)]   ; rounding for the extra >>1 below
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
        pandn       mm4, mm5              ; high edge variance additive

        paddsb      mm6, mm2              ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+rax], mm6        ; write back

        movq        mm6, [rsi+2*rax]      ; p1
        pxor        mm6, [GLOBAL(t80)]    ; reoffset
        paddsb      mm6, mm4              ; p1+= p1 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+2*rax], mm6      ; write back

        psubsb      mm3, mm0              ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]    ; unoffset
        movq        [rsi], mm3            ; write back

        psubsb      mm7, mm4              ; q1-= q1 add
        pxor        mm7, [GLOBAL(t80)]    ; unoffset
        movq        [rdi], mm7            ; write back

        add         rsi,8                 ; advance to the next 8-pixel segment
        neg         rax                   ; restore positive pitch for next iteration
        dec         rcx
        jnz         .next8_h

    add rsp, 32
    pop rsp                               ; restore pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
225
226
;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; Normal (inner) VP8 loop filter applied across a vertical edge.  Loads an
; 8x8 block straddling the edge, transposes it so the filter math can run
; on whole MMX registers, filters, then transposes the four modified
; columns (p1 p0 q0 q1) back into place.  'count' = number of 8-row blocks.
global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 64      ; reserve 64 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];  scratch: abs(p1-p0)
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];  scratch: abs(q1-q0)
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32]; transposed p1,p0,q0,q1 rows

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        lea         rsi,        [rsi + rax*4 - 4] ; center of 8x8 block: 4 rows down, 4 cols left of edge

        movsxd      rcx,        dword ptr arg(5) ;count
.next8_v:
        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
        add         rdi,        rax


        ;transpose (byte labels are row/column: '64' = row 6, column 4)
        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60

        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60

        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40

        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40

        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46

        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40

        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40

        neg         rax                                     ; pitch negative: address rows above
        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20

        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24

        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00

        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04

        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04

        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3

        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2

        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
        psubusb     mm5,        mm7                         ; q2-q3

        psubusb     mm7,        mm6                         ; q3-q2
        por         mm7,        mm5;                        ; mm7=abs (q3-q2)

        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1

        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1

        psubusb     mm3,        mm6                         ; q1-q2
        psubusb     mm6,        mm5                         ; q2-q1

        por         mm6,        mm3                         ; mm6=abs(q2-q1)
        lea         rdx,        srct                        ; rdx -> transposed scratch area

        movq        [rdx+24],   mm5                         ; save q1
        movq        [rdx+16],   mm0                         ; save q0

        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00

        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00

        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00

        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2

        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
        psubusb     mm2,        mm0                         ; p2-p3

        psubusb     mm0,        mm1                         ; p3-p2
        por         mm0,        mm2                         ; mm0=abs(p3-p2)

        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1

        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+8],    mm3                         ; save p0

        movq        [rdx],      mm2                         ; save p1
        movq        mm5,        mm2                         ; mm5 = p1

        psubusb     mm2,        mm1                         ; p1-p2
        psubusb     mm1,        mm5                         ; p2-p1

        por         mm1,        mm2                         ; mm1=abs(p2-p1)
        mov         rdx,        arg(3) ;limit

        ; filter mask: a byte stays nonzero if any difference exceeds 'limit'
        movq        mm4,        [rdx]                       ; mm4 = limit
        psubusb     mm7,        mm4

        psubusb     mm0,        mm4
        psubusb     mm1,        mm4

        psubusb     mm6,        mm4
        por         mm7,        mm6

        por         mm0,        mm1
        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

        movq        mm1,        mm5                         ; p1

        movq        mm7,        mm3                         ; mm3=mm7=p0
        psubusb     mm7,        mm5                         ; p0 - p1

        psubusb     mm5,        mm3                         ; p1 - p0
        por         mm5,        mm7                         ; abs(p1-p0)

        movq        t0,         mm5                         ; save abs(p1-p0)
        lea         rdx,        srct

        psubusb     mm5,        mm4
        por         mm0,        mm5                         ; mm0=mask

        movq        mm5,        [rdx+16]                    ; mm5=q0
        movq        mm7,        [rdx+24]                    ; mm7=q1

        movq        mm6,        mm5                         ; mm6=q0
        movq        mm2,        mm7                         ; q1
        psubusb     mm5,        mm7                         ; q0-q1

        psubusb     mm7,        mm6                         ; q1-q0
        por         mm7,        mm5                         ; abs(q1-q0)

        movq        t1,         mm7                         ; save abs(q1-q0)
        psubusb     mm7,        mm4

        por         mm0,        mm7                         ; mask

        movq        mm5,        mm2                         ; q1
        psubusb     mm5,        mm1                         ; q1-=p1
        psubusb     mm1,        mm2                         ; p1-=q1
        por         mm5,        mm1                         ; abs(p1-q1)
        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
        psrlw       mm5,        1                           ; abs(p1-q1)/2

        mov         rdx,        arg(2) ;blimit                      ;

        movq        mm4,        [rdx]                       ;blimit
        movq        mm1,        mm3                         ; mm1=mm3=p0

        movq        mm7,        mm6                         ; mm7=mm6=q0
        psubusb     mm1,        mm7                         ; p0-q0

        psubusb     mm7,        mm3                         ; q0-p0
        por         mm1,        mm7                         ; abs(q0-p0)
        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1,        mm0;                        ; mask
        pxor        mm0,        mm0
        pcmpeqb     mm1,        mm0                         ; mm1: 0xFF where every limit test passed

        ; calculate high edge variance
        mov         rdx,        arg(4) ;thresh            ; get thresh
        movq        mm7,        [rdx]
        ;
        movq        mm4,        t0              ; get abs (p1 - p0)
        psubusb     mm4,        mm7

        movq        mm3,        t1              ; get abs (q1 - q0)
        psubusb     mm3,        mm7

        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     mm4,        mm0             ; 0xFF where variance is low

        pcmpeqb     mm0,        mm0             ; all ones
        pxor        mm4,        mm0             ; invert: mm4 = hev (high edge variance) mask



        ; start work on filters
        lea         rdx,        srct

        movq        mm2,        [rdx]           ; p1
        movq        mm7,        [rdx+24]        ; q1

        movq        mm6,        [rdx+8]         ; p0
        movq        mm0,        [rdx+16]        ; q0

        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values

        psubsb      mm2,        mm7             ; p1 - q1
        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)

        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values

        movq        mm3,        mm0             ; q0
        psubsb      mm0,        mm6             ; q0 - p0

        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand       mm1,        mm2              ; mask filter values we don't care about

        movq        mm2,        mm1
        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4

        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; arithmetic >>3 of each signed byte: byte placed in high half of a
        ; word (low byte zero), then psraw by 8+3 = 11
        pxor        mm0,        mm0          ;
        pxor        mm5,        mm5

        punpcklbw   mm0,        mm2         ; low four bytes -> high byte of words
        punpckhbw   mm5,        mm2         ; high four bytes -> high byte of words

        psraw       mm0,        11              ; sign extended shift right by 3
        psraw       mm5,        11
        packsswb    mm0,        mm5

        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0,        mm0           ; 0
        movq        mm5,        mm1           ; abcdefgh

        punpcklbw   mm0,        mm1           ; e0f0g0h0
        psraw       mm0,        11                ; sign extended shift right by 3

        pxor        mm1,        mm1           ; 0
        punpckhbw   mm1,        mm5           ; a0b0c0d0

        psraw       mm1,        11                ; sign extended shift right by 3
        movq        mm5,        mm0              ; save results

        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5,        [GLOBAL(ones)]   ; rounding for the extra >>1 below

        paddsw      mm1,        [GLOBAL(ones)]
        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap

        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

        pandn       mm4,        mm5             ; high edge variance additive

        paddsb      mm6,        mm2             ; p0+= p0 add
        pxor        mm6,        [GLOBAL(t80)]   ; unoffset

        ; mm6=p0                               ;
        movq        mm1,        [rdx]           ; p1
        pxor        mm1,        [GLOBAL(t80)]   ; reoffset

        paddsb      mm1,        mm4                 ; p1+= p1 add
        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
        ; mm6 = p0 mm1 = p1

        psubsb      mm3,        mm0                 ; q0-= q0 add
        pxor        mm3,        [GLOBAL(t80)]       ; unoffset

        ; mm3 = q0
        psubsb      mm7,        mm4                 ; q1-= q1 add
        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
        ; mm7 = q1

        ; transpose and write back only the four changed columns (p1 p0 q0 q1)
        ; mm1 =    72 62 52 42 32 22 12 02
        ; mm6 =    73 63 53 43 33 23 13 03
        ; mm3 =    74 64 54 44 34 24 14 04
        ; mm7 =    75 65 55 45 35 25 15 05

        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02

        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42

        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44

        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02

        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42

        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62


        ; mm2 = 15 14 13 12 05 04 03 02
        ; mm6 = 35 34 33 32 25 24 23 22
        ; mm1 = 55 54 53 52 45 44 43 42
        ; mm5 = 75 74 73 72 65 64 63 62

        ; rax is still negative here; rsi addresses row 4, rdi row 5

        movd        [rsi+rax*4+2], mm2      ; row 0
        psrlq       mm2,        32

        movd        [rdi+rax*4+2], mm2      ; row 1
        movd        [rsi+rax*2+2], mm6      ; row 2

        psrlq       mm6,        32
        movd        [rsi+rax+2],mm6         ; row 3

        movd        [rsi+2],    mm1         ; row 4
        psrlq       mm1,        32

        movd        [rdi+2],    mm1         ; row 5
        neg         rax                     ; pitch positive again

        movd        [rdi+rax+2],mm5         ; row 6
        psrlq       mm5,        32

        movd        [rdi+rax*2+2], mm5      ; row 7

        lea         rsi,        [rsi+rax*8] ; advance 8 rows to the next block
        dec         rcx
        jnz         .next8_v

    add rsp, 64
    pop rsp                                 ; restore pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
595
596
597;void vp8_mbloop_filter_horizontal_edge_mmx
598;(
599;    unsigned char *src_ptr,
600;    int  src_pixel_step,
601;    const char *blimit,
602;    const char *limit,
603;    const char *thresh,
604;    int count
605;)
606global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
607sym(vp8_mbloop_filter_horizontal_edge_mmx):
608    push        rbp
609    mov         rbp, rsp
610    SHADOW_ARGS_TO_STACK 6
611    GET_GOT     rbx
612    push        rsi
613    push        rdi
614    ; end prolog
615
616    ALIGN_STACK 16, rax
617    sub          rsp, 32      ; reserve 32 bytes
618    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
619    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
620
621        mov         rsi, arg(0) ;src_ptr
622        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
623
624        movsxd      rcx, dword ptr arg(5) ;count
625.next8_mbh:
626        mov         rdx, arg(3) ;limit
627        movq        mm7, [rdx]
628        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
629        add         rdi, rax
630
631        ; calculate breakout conditions
632        movq        mm2, [rdi+2*rax]      ; q3
633
634        movq        mm1, [rsi+2*rax]      ; q2
635        movq        mm6, mm1              ; q2
636        psubusb     mm1, mm2              ; q2-=q3
637        psubusb     mm2, mm6              ; q3-=q2
638        por         mm1, mm2              ; abs(q3-q2)
639        psubusb     mm1, mm7
640
641
642        ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
643        movq        mm4, [rsi+rax]        ; q1
644        movq        mm3, mm4              ; q1
645        psubusb     mm4, mm6              ; q1-=q2
646        psubusb     mm6, mm3              ; q2-=q1
647        por         mm4, mm6              ; abs(q2-q1)
648        psubusb     mm4, mm7
649        por        mm1, mm4
650
651
652        ; mm1 = mask,      mm3=q1, mm7 = limit
653
654        movq        mm4, [rsi]            ; q0
655        movq        mm0, mm4              ; q0
656        psubusb     mm4, mm3              ; q0-=q1
657        psubusb     mm3, mm0              ; q1-=q0
658        por         mm4, mm3              ; abs(q0-q1)
659        movq        t0, mm4               ; save to t0
660        psubusb     mm4, mm7
661        por        mm1, mm4
662
663
664        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
665
666        neg         rax                   ; negate pitch to deal with above border
667
668        movq        mm2, [rsi+4*rax]      ; p3
669        movq        mm4, [rdi+4*rax]      ; p2
670        movq        mm5, mm4              ; p2
671        psubusb     mm4, mm2              ; p2-=p3
672        psubusb     mm2, mm5              ; p3-=p2
673        por         mm4, mm2              ; abs(p3 - p2)
674        psubusb     mm4, mm7
675        por        mm1, mm4
676        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
677
678        movq        mm4, [rsi+2*rax]      ; p1
679        movq        mm3, mm4              ; p1
680        psubusb     mm4, mm5              ; p1-=p2
681        psubusb     mm5, mm3              ; p2-=p1
682        por         mm4, mm5              ; abs(p2 - p1)
683        psubusb     mm4, mm7
684        por        mm1, mm4
685
686        movq        mm2, mm3              ; p1
687
688
689        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
690
691        movq        mm4, [rsi+rax]        ; p0
692        movq        mm5, mm4              ; p0
693        psubusb     mm4, mm3              ; p0-=p1
694        psubusb     mm3, mm5              ; p1-=p0
695        por         mm4, mm3              ; abs(p1 - p0)
696        movq        t1, mm4               ; save to t1
697        psubusb     mm4, mm7
698        por        mm1, mm4
699        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
700        ; mm5 = p0
701        movq        mm3, [rdi]            ; q1
702        movq        mm4, mm3              ; q1
703        psubusb     mm3, mm2              ; q1-=p1
704        psubusb     mm2, mm4              ; p1-=q1
705        por         mm2, mm3              ; abs(p1-q1)
706        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
707        psrlw       mm2, 1                ; abs(p1-q1)/2
708
709        movq        mm6, mm5              ; p0
710        movq        mm3, mm0              ; q0
711        psubusb     mm5, mm3              ; p0-=q0
712        psubusb     mm3, mm6              ; q0-=p0
713        por         mm5, mm3              ; abs(p0 - q0)
714        paddusb     mm5, mm5              ; abs(p0-q0)*2
715        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
716
717        mov         rdx, arg(2) ;blimit           ; get blimit
718        movq        mm7, [rdx]            ; blimit
719
720        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
721        por         mm1,    mm5
722        pxor        mm5,    mm5
723        pcmpeqb     mm1,    mm5           ; mask mm1
724
725        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
726        ; mm6 = p0,
727
728        ; calculate high edge variance
729        mov         rdx, arg(4) ;thresh           ; get thresh
730        movq        mm7, [rdx]            ;
731        movq        mm4, t0               ; get abs (q1 - q0)
732        psubusb     mm4, mm7
733        movq        mm3, t1               ; get abs (p1 - p0)
734        psubusb     mm3, mm7
735        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
736
737        pcmpeqb     mm4,        mm5
738
739        pcmpeqb     mm5,        mm5
740        pxor        mm4,        mm5
741
742
743
744        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
745        ; mm6 = p0, mm4=hev
746        ; start work on filters
747        movq        mm2, [rsi+2*rax]      ; p1
748        movq        mm7, [rdi]            ; q1
749        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
750        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
751        psubsb      mm2, mm7              ; p1 - q1
752
753        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
754        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
755        movq        mm3, mm0              ; q0
756        psubsb      mm0, mm6              ; q0 - p0
757        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
758        paddsb      mm2, mm0              ; 2 * (q0 - p0)
759        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
760        pand        mm1, mm2              ; mask filter values we don't care about
761
762
763        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
764        movq        mm2, mm1              ; vp8_filter
765        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
766
767        movq        mm5,        mm2       ;
768        paddsb      mm5,        [GLOBAL(t3)];
769
770        pxor        mm0, mm0              ; 0
771        pxor        mm7, mm7              ; 0
772
773        punpcklbw   mm0, mm5              ; e0f0g0h0
774        psraw       mm0, 11               ; sign extended shift right by 3
775        punpckhbw   mm7, mm5              ; a0b0c0d0
776        psraw       mm7, 11               ; sign extended shift right by 3
777        packsswb    mm0, mm7              ; Filter2 >>=3;
778
779        movq        mm5, mm0              ; Filter2
780
781        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
782        pxor        mm0, mm0              ; 0
783        pxor        mm7, mm7              ; 0
784
785        punpcklbw   mm0, mm2              ; e0f0g0h0
786        psraw       mm0, 11               ; sign extended shift right by 3
787        punpckhbw   mm7, mm2              ; a0b0c0d0
788        psraw       mm7, 11               ; sign extended shift right by 3
789        packsswb    mm0, mm7              ; Filter2 >>=3;
790
791        ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
792        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
793        paddsb      mm6, mm5              ; ps0 =ps0 + Fitler2
794
795        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
796        ; vp8_filter &= ~hev;
797        ; Filter2 = vp8_filter;
798        pandn       mm4, mm1              ; vp8_filter&=~hev
799
800
801        ; mm3=qs0, mm4=filter2, mm6=ps0
802
803        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
804        ; s = vp8_signed_char_clamp(qs0 - u);
805        ; *oq0 = s^0x80;
806        ; s = vp8_signed_char_clamp(ps0 + u);
807        ; *op0 = s^0x80;
808        pxor        mm0, mm0
809
810        pxor        mm1, mm1
811        pxor        mm2, mm2
812        punpcklbw   mm1, mm4
813        punpckhbw   mm2, mm4
814        pmulhw      mm1, [GLOBAL(s27)]
815        pmulhw      mm2, [GLOBAL(s27)]
816        paddw       mm1, [GLOBAL(s63)]
817        paddw       mm2, [GLOBAL(s63)]
818        psraw       mm1, 7
819        psraw       mm2, 7
820        packsswb    mm1, mm2
821
822        psubsb      mm3, mm1
823        paddsb      mm6, mm1
824
825        pxor        mm3, [GLOBAL(t80)]
826        pxor        mm6, [GLOBAL(t80)]
827        movq        [rsi+rax], mm6
828        movq        [rsi],     mm3
829
830        ; roughly 2/7th difference across boundary
831        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
832        ; s = vp8_signed_char_clamp(qs1 - u);
833        ; *oq1 = s^0x80;
834        ; s = vp8_signed_char_clamp(ps1 + u);
835        ; *op1 = s^0x80;
836        pxor        mm1, mm1
837        pxor        mm2, mm2
838        punpcklbw   mm1, mm4
839        punpckhbw   mm2, mm4
840        pmulhw      mm1, [GLOBAL(s18)]
841        pmulhw      mm2, [GLOBAL(s18)]
842        paddw       mm1, [GLOBAL(s63)]
843        paddw       mm2, [GLOBAL(s63)]
844        psraw       mm1, 7
845        psraw       mm2, 7
846        packsswb    mm1, mm2
847
848        movq        mm3, [rdi]
849        movq        mm6, [rsi+rax*2]       ; p1
850
851        pxor        mm3, [GLOBAL(t80)]
852        pxor        mm6, [GLOBAL(t80)]
853
854        paddsb      mm6, mm1
855        psubsb      mm3, mm1
856
857        pxor        mm6, [GLOBAL(t80)]
858        pxor        mm3, [GLOBAL(t80)]
859        movq        [rdi], mm3
860        movq        [rsi+rax*2], mm6
861
862        ; roughly 1/7th difference across boundary
863        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
864        ; s = vp8_signed_char_clamp(qs2 - u);
865        ; *oq2 = s^0x80;
866        ; s = vp8_signed_char_clamp(ps2 + u);
867        ; *op2 = s^0x80;
868        pxor        mm1, mm1
869        pxor        mm2, mm2
870        punpcklbw   mm1, mm4
871        punpckhbw   mm2, mm4
872        pmulhw      mm1, [GLOBAL(s9)]
873        pmulhw      mm2, [GLOBAL(s9)]
874        paddw       mm1, [GLOBAL(s63)]
875        paddw       mm2, [GLOBAL(s63)]
876        psraw       mm1, 7
877        psraw       mm2, 7
878        packsswb    mm1, mm2
879
880
881        movq        mm6, [rdi+rax*4]
882        neg         rax
883        movq        mm3, [rdi+rax  ]
884
885        pxor        mm6, [GLOBAL(t80)]
886        pxor        mm3, [GLOBAL(t80)]
887
888        paddsb      mm6, mm1
889        psubsb      mm3, mm1
890
891        pxor        mm6, [GLOBAL(t80)]
892        pxor        mm3, [GLOBAL(t80)]
893        movq        [rdi+rax  ], mm3
894        neg         rax
895        movq        [rdi+rax*4], mm6
896
897;EARLY_BREAK_OUT:
898        neg         rax
899        add         rsi,8
900        dec         rcx
901        jnz         .next8_mbh
902
903    add rsp, 32
904    pop rsp
905    ; begin epilog
906    pop rdi
907    pop rsi
908    RESTORE_GOT
909    UNSHADOW_ARGS
910    pop         rbp
911    ret
912
913
;void vp8_mbloop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; VP8 macroblock loop filter applied across a vertical edge.  Each loop
; iteration transposes an 8x8 block of pixels (columns p3..q3, i.e. 4
; bytes either side of the edge) into the row-major scratch buffer
; `srct`, computes the filter mask and high-edge-variance (hev) mask,
; applies the macroblock filter (27/18/9 weighted adjustments across
; p2..q2/q0..q2), then transposes back and stores.  Runs arg(5)
; iterations of 8 rows each.
; Transpose comments use the notation RC = row R, column C.
global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 96      ; reserve 96 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];  transposed p3..q3, 8 bytes apiece

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        lea         rsi,        [rsi + rax*4 - 4]       ; start 4 rows down, 4 bytes left of the edge

        movsxd      rcx,        dword ptr arg(5) ;count
.next8_mbv:
        lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing

        ;transpose the lower 4 rows (4..7) of the 8x8 block
        movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60

        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
        punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64

        punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
        movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50

        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40

        punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40

        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46

        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40

        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40

        neg         rax                                     ; negative pitch: address the rows above rsi

        movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20

        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
        punpckhbw   mm6,        mm7                         ; 37 27 36 26 35 25 34 24

        punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20

        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04

        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06

        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06

        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2

        lea         rdx,        srct
        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06

        movq        [rdx+56],   mm7                         ; save q3
        psubusb     mm5,        mm7                         ; q2-q3


        movq        [rdx+48],   mm6                         ; save q2
        psubusb     mm7,        mm6                         ; q3-q2

        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04

        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0

        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
        psubusb     mm3,        mm6                         ; q1-q2

        psubusb     mm6,        mm5                         ; q2-q1
        por         mm6,        mm3                         ; mm6=abs(q2-q1)

        movq        [rdx+40],   mm5                         ; save q1
        movq        [rdx+32],   mm0                         ; save q0

        ;transpose the upper 4 rows (0..3) to produce p3..p0
        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00

        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00

        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00

        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2

        movq        [rdx],      mm0                         ; save p3
        movq        [rdx+8],    mm1                         ; save p2

        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
        psubusb     mm2,        mm0                         ; p2-p3

        psubusb     mm0,        mm1                         ; p3-p2
        por         mm0,        mm2                         ; mm0=abs(p3-p2)

        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1

        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+24],   mm3                         ; save p0

        movq        [rdx+16],   mm2                         ; save p1
        movq        mm5,        mm2                         ; mm5 = p1

        psubusb     mm2,        mm1                         ; p1-p2
        psubusb     mm1,        mm5                         ; p2-p1

        por         mm1,        mm2                         ; mm1=abs(p2-p1)
        mov         rdx,        arg(3) ;limit

        ; build the filter mask: filtering is skipped for any byte where one
        ; of the secondary pixel differences exceeds the inner limit
        movq        mm4,        [rdx]                       ; mm4 = limit
        psubusb     mm7,        mm4                         ; abs(q3-q2) > limit

        psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
        psubusb     mm1,        mm4                         ; abs(p2-p1) > limit

        psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
        por         mm7,        mm6                         ; or

        por         mm0,        mm1                         ;
        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

        movq        mm1,        mm5                         ; p1

        movq        mm7,        mm3                         ; mm3=mm7=p0
        psubusb     mm7,        mm5                         ; p0 - p1

        psubusb     mm5,        mm3                         ; p1 - p0
        por         mm5,        mm7                         ; abs(p1-p0)

        movq        t0,         mm5                         ; save abs(p1-p0)
        lea         rdx,        srct

        psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
        por         mm0,        mm5                         ; mm0=mask

        movq        mm5,        [rdx+32]                    ; mm5=q0
        movq        mm7,        [rdx+40]                    ; mm7=q1

        movq        mm6,        mm5                         ; mm6=q0
        movq        mm2,        mm7                         ; q1
        psubusb     mm5,        mm7                         ; q0-q1

        psubusb     mm7,        mm6                         ; q1-q0
        por         mm7,        mm5                         ; abs(q1-q0)

        movq        t1,         mm7                         ; save abs(q1-q0)
        psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit

        por         mm0,        mm7                         ; mask

        movq        mm5,        mm2                         ; q1
        psubusb     mm5,        mm1                         ; q1-=p1
        psubusb     mm1,        mm2                         ; p1-=q1
        por         mm5,        mm1                         ; abs(p1-q1)
        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
        psrlw       mm5,        1                           ; abs(p1-q1)/2

        mov         rdx,        arg(2) ;blimit                      ;

        movq        mm4,        [rdx]                       ;blimit
        movq        mm1,        mm3                         ; mm1=mm3=p0

        movq        mm7,        mm6                         ; mm7=mm6=q0
        psubusb     mm1,        mm7                         ; p0-q0

        psubusb     mm7,        mm3                         ; q0-p0
        por         mm1,        mm7                         ; abs(q0-p0)
        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1,        mm0;                        ; mask

        pxor        mm0,        mm0
        pcmpeqb     mm1,        mm0                         ; mm1 = 0xFF where all thresholds passed (filter this byte)

        ; calculate high edge variance
        mov         rdx,        arg(4) ;thresh            ; get thresh
        movq        mm7,        [rdx]
        ;
        movq        mm4,        t0              ; get abs (p1 - p0)
        psubusb     mm4,        mm7             ; abs(p1 - p0) > thresh

        movq        mm3,        t1              ; get abs (q1 - q0)
        psubusb     mm3,        mm7             ; abs(q1 - q0)> thresh

        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     mm4,        mm0             ; 0xFF where variance is low

        pcmpeqb     mm0,        mm0
        pxor        mm4,        mm0             ; invert: mm4 = hev mask (0xFF where variance is high)




        ; start work on filters
        lea         rdx,        srct

        ; start work on filters
        movq        mm2, [rdx+16]         ; p1
        movq        mm7, [rdx+40]         ; q1
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1

        movq        mm6, [rdx+24]         ; p0
        movq        mm0, [rdx+32]         ; q0
        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values

        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      mm2, mm0              ; 2 * (q0 - p0)
        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
        pand       mm1, mm2           ; mask filter values we don't care about

        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
        movq        mm2, mm1              ; vp8_filter
        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev

        movq        mm5,        mm2       ;
        paddsb      mm5,        [GLOBAL(t3)]; vp8_signed_char_clamp(Filter2 + 3)

        pxor        mm0, mm0              ; 0
        pxor        mm7, mm7              ; 0

        ; per-byte arithmetic >>3: place bytes in the high half of each word,
        ; shift right 11 (8 to reposition + 3), then repack
        punpcklbw   mm0, mm5              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        punpckhbw   mm7, mm5              ; a0b0c0d0
        psraw       mm7, 11               ; sign extended shift right by 3
        packsswb    mm0, mm7              ; Filter2 >>=3;

        movq        mm5, mm0              ; Filter2

        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
        pxor        mm0, mm0              ; 0
        pxor        mm7, mm7              ; 0

        punpcklbw   mm0, mm2              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        punpckhbw   mm7, mm2              ; a0b0c0d0
        psraw       mm7, 11               ; sign extended shift right by 3
        packsswb    mm0, mm7              ; Filter2 >>=3;

        ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
        paddsb      mm6, mm5              ; ps0 =ps0 + Filter2

        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
        ; vp8_filter &= ~hev;
        ; Filter2 = vp8_filter;
        pandn       mm4, mm1              ; vp8_filter&=~hev


        ; mm3=qs0, mm4=filter2, mm6=ps0

        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
        ; s = vp8_signed_char_clamp(qs0 - u);
        ; *oq0 = s^0x80;
        ; s = vp8_signed_char_clamp(ps0 + u);
        ; *op0 = s^0x80;
        pxor        mm0, mm0

        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s27)]
        pmulhw      mm2, [GLOBAL(s27)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2

        psubsb      mm3, mm1
        paddsb      mm6, mm1

        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]
        movq        [rdx+24], mm6         ; store filtered p0 back to scratch
        movq        [rdx+32], mm3         ; store filtered q0 back to scratch

        ; roughly 2/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
        ; s = vp8_signed_char_clamp(qs1 - u);
        ; *oq1 = s^0x80;
        ; s = vp8_signed_char_clamp(ps1 + u);
        ; *op1 = s^0x80;
        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s18)]
        pmulhw      mm2, [GLOBAL(s18)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2

        movq        mm3, [rdx + 40]       ; q1
        movq        mm6, [rdx + 16]       ; p1
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]

        paddsb      mm6, mm1
        psubsb      mm3, mm1

        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]
        movq        [rdx + 40], mm3       ; store filtered q1
        movq        [rdx + 16], mm6       ; store filtered p1

        ; roughly 1/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
        ; s = vp8_signed_char_clamp(qs2 - u);
        ; *oq2 = s^0x80;
        ; s = vp8_signed_char_clamp(ps2 + u);
        ; *op2 = s^0x80;
        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s9)]
        pmulhw      mm2, [GLOBAL(s9)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2

        movq        mm6, [rdx+ 8]         ; p2
        movq        mm3, [rdx+48]         ; q2

        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]

        paddsb      mm6, mm1
        psubsb      mm3, mm1

        pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
        pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 16 06

        ; transpose and write back
        movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
        movq        mm1,    mm0                 ; mm0 = 70 60 50 40 30 20 10 00

        punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
        punpckhbw   mm1,    mm6                 ; mm1 = 71 70 61 60 51 50 41 40

        movq        mm2,    [rdx+16]            ; mm2 = 72 62 52 42 32 22 12 02
        movq        mm6,    mm2                 ; mm6 = 72 62 52 42 32 22 12 02

        punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
        punpckhbw   mm6,    [rdx+24]            ; mm6 = 73 72 63 62 53 52 43 42

        movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
        punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00

        punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
        movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40

        punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
        punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60

        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
        punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04

        movq        mm6,    mm3                 ; mm6 = 76 66 56 46 36 26 16 06
        punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06

        movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
        punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04

        punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
        movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00

        punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
        punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10

        movq        [rsi+rax*4], mm0            ; write out
        movq        [rdi+rax*4], mm6            ; write out

        movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
        punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 21 20

        punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
        movq        [rsi+rax*2], mm0            ; write out

        movq        [rdi+rax*2], mm5            ; write out
        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04

        punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 55 54 45 44
        punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46

        movq        mm5,    mm2                 ; mm5 = 75 74 65 64 55 54 45 44
        punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44

        punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
        movq        mm0,    mm1                 ; mm0=  53 52 51 50 43 42 41 40

        movq        mm3,    mm4                 ; mm3 = 73 72 71 70 63 62 61 60
        punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40

        punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
        movq        [rsi],  mm0                 ; write out

        movq        [rdi],  mm1                 ; write out
        neg         rax                         ; restore positive pitch for the lower rows

        punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
        punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 70

        movq        [rsi+rax*2], mm3
        movq        [rdi+rax*2], mm4

        lea         rsi,        [rsi+rax*8]     ; advance to the next 8 rows
        dec         rcx

        jnz         .next8_mbv

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
1379
1380
;void vp8_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
;
; VP8 "simple" loop filter across a horizontal edge: adjusts only the
; two pixels straddling the edge (p0, q0), for bytes where
; 2*abs(p0-q0) + abs(p1-q1)/2 <= blimit.  Processes 16 pixels as two
; groups of 8 (loop count is fixed at 2).
global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        mov         rcx, 2                ; count
.nexts8_h:
        mov         rdx, arg(2) ;blimit           ; get blimit
        movq        mm3, [rdx]            ;

        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
        add         rdi, rax
        neg         rax                   ; rax is now -pitch; rows above rsi use +rax

        ; calculate mask
        movq        mm1, [rsi+2*rax]      ; p1
        movq        mm0, [rdi]            ; q1
        movq        mm2, mm1              ; keep unsigned p1 for the filter stage
        movq        mm7, mm0              ; keep unsigned q1 for the filter stage
        movq        mm4, mm0
        psubusb     mm0, mm1              ; q1-=p1
        psubusb     mm1, mm4              ; p1-=q1
        por         mm1, mm0              ; abs(p1-q1)
        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
        psrlw       mm1, 1                ; abs(p1-q1)/2

        movq        mm5, [rsi+rax]        ; p0
        movq        mm4, [rsi]            ; q0
        movq        mm0, mm4              ; q0
        movq        mm6, mm5              ; p0
        psubusb     mm5, mm4              ; p0-=q0
        psubusb     mm4, mm6              ; q0-=p0
        por         mm5, mm4              ; abs(p0 - q0)
        paddusb     mm5, mm5              ; abs(p0-q0)*2
        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        mm3, mm3
        pcmpeqb     mm5, mm3              ; mask: 0xFF where the byte should be filtered

        ; start work on filters
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1

        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        mm5, mm2              ; mask filter values we don't care about

        ; do + 4 side
        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4

        ; per-byte arithmetic >>3 without unpacking: handle the low byte of
        ; each word via <<8, >>3, >>8(logical), and the high byte via >>11, <<8
        movq        mm0, mm5              ; get a copy of filters
        psllw       mm0, 8                ; shift left 8
        psraw       mm0, 3                ; arithmetic shift right 3
        psrlw       mm0, 8
        movq        mm1, mm5              ; get a copy of filters
        psraw       mm1, 11               ; arithmetic shift right 11
        psllw       mm1, 8                ; shift left 8 to put it back

        por         mm0, mm1              ; put the two together to get result

        psubsb      mm3, mm0              ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]    ; unoffset
        movq        [rsi], mm3            ; write back

        ; now do +3 side
        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4

        movq        mm0, mm5              ; get a copy of filters
        psllw       mm0, 8                ; shift left 8
        psraw       mm0, 3                ; arithmetic shift right 3
        psrlw       mm0, 8
        psraw       mm5, 11               ; arithmetic shift right 11
        psllw       mm5, 8                ; shift left 8 to put it back
        por         mm0, mm5              ; put the two together to get result


        paddsb      mm6, mm0              ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+rax], mm6        ; write back

        add         rsi,8                 ; next 8 pixels along the edge
        neg         rax                   ; restore positive pitch
        dec         rcx
        jnz         .nexts8_h

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
1495
1496
1497;void vp8_loop_filter_simple_vertical_edge_mmx
1498;(
1499;    unsigned char *src_ptr,
1500;    int  src_pixel_step,
1501;    const char *blimit
1502;)
;-----------------------------------------------------------------------------
; Applies the VP8 "simple" loop filter across a vertical (column) edge.
; For each of 16 rows (two 8-row groups), with columns p1 p0 | q0 q1
; straddling the edge:
;   mask   = (2*|p0-q0| + |p1-q1|/2 <= blimit)       per pixel
;   Filter = clamp(p1 - q1 + 3*(q0 - p0))            signed bytes, masked
;   q0    -= (Filter + 4) >> 3
;   p0    += (Filter + 3) >> 3
; Each 8-row group is loaded, transposed into per-column registers,
; filtered, transposed back, and stored.
;
; Register roles in the loop:
;   rsi = 2 bytes left of the edge, 4 rows into the current group
;   rdi = rsi + stride
;   rax = stride (temporarily negated to address the rows above rsi)
;   rcx = remaining 8-row groups
;-----------------------------------------------------------------------------
global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
sym(vp8_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 32      ; reserve 32 bytes: two 8-byte spill slots
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];  holds p1 while mm0 is reused
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];  holds q1 while mm3 is reused

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step (row stride in bytes)

        lea         rsi, [rsi + rax*4- 2];  ; 4 rows down, 2 bytes left of the edge
        mov         rcx, 2                                      ; count: two 8-row groups (16 rows)
.nexts8_v:

        lea         rdi,        [rsi + rax];                    ; rdi = row below rsi
        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70

        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60

        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40

        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40

        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42

        neg         rax                                         ; now address the 4 rows above rsi

        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20

        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10

        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00

        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00

        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00

        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02

        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0

        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1


        ; calculate mask: filter only where 2*|p0-q0| + |p1-q1|/2 <= blimit
        movq        mm6,        mm0                             ; p1
        movq        mm7,        mm3                             ; q1
        psubusb     mm7,        mm6                             ; q1-=p1
        psubusb     mm6,        mm3                             ; p1-=q1
        por         mm6,        mm7                             ; abs(p1-q1)
        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero...
        psrlw       mm6,        1                               ; ...so the word shift is a per-byte abs(p1-q1)/2

        movq        mm5,        mm1                             ; p0
        movq        mm4,        mm2                             ; q0

        psubusb     mm5,        mm2                             ; p0-=q0
        psubusb     mm4,        mm1                             ; q0-=p0

        por         mm5,        mm4                             ; abs(p0 - q0)
        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx,        arg(2) ;blimit                          ; get blimit
        movq        mm7,        [rdx]

        psubusb     mm5,        mm7                             ; nonzero where 2*|p0-q0| + |p1-q1|/2 > blimit
        pxor        mm7,        mm7
        pcmpeqb     mm5,        mm7                             ; mm5 = mask (0xff where the filter applies)

        ; start work on filters
        movq        t0,         mm0                             ; spill unfiltered p1 for the output transpose
        movq        t1,         mm3                             ; spill unfiltered q1 for the output transpose

        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values

        psubsb      mm0,        mm3                             ; p1 - q1
        movq        mm6,        mm1                             ; p0

        movq        mm7,        mm2                             ; q0
        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values

        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
        movq        mm3,        mm7                             ; offseted ; q0

        psubsb      mm7,        mm6                             ; q0 - p0
        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)

        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)

        pand        mm5,        mm0                             ; mask filter values we don't care about

        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4

        ; per-byte signed >>3 built from word shifts:
        ; low bytes via <<8, arithmetic >>3, logical >>8; high bytes via arithmetic >>11, <<8
        movq        mm0,        mm5                             ; get a copy of filters
        psllw       mm0,        8                               ; shift left 8
        psraw       mm0,        3                               ; low bytes: arithmetic shift right 3
        psrlw       mm0,        8

        movq        mm7,        mm5                             ; get a copy of filters
        psraw       mm7,        11                              ; high bytes: arithmetic shift right 11 (= byte >> 3)
        psllw       mm7,        8                               ; shift left 8 to put it back

        por         mm0,        mm7                             ; put the two together to get result

        psubsb      mm3,        mm0                             ; q0 -= (Filter + 4) >> 3
        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset

        ; now do +3 side
        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4

        movq        mm0, mm5                                    ; get a copy of filters
        psllw       mm0, 8                                      ; shift left 8
        psraw       mm0, 3                                      ; low bytes: arithmetic shift right 3
        psrlw       mm0, 8

        psraw       mm5, 11                                     ; high bytes: arithmetic shift right 11 (= byte >> 3)
        psllw       mm5, 8                                      ; shift left 8 to put it back
        por         mm0, mm5                                    ; put the two together to get result

        paddsb      mm6, mm0                                    ; p0 += (Filter + 3) >> 3
        pxor        mm6, [GLOBAL(t80)]                          ; unoffset


        movq        mm0,        t0                              ; reload unfiltered p1
        movq        mm4,        t1                              ; reload unfiltered q1

        ; mm0 = 70 60 50 40 30 20 10 00   (p1)
        ; mm6 = 71 61 51 41 31 21 11 01   (new p0)
        ; mm3 = 72 62 52 42 32 22 12 02   (new q0)
        ; mm4 = 73 63 53 43 33 23 13 03   (q1)
        ; transpose back to write out

        movq        mm1,        mm0                         ;
        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00

        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
        movq        mm2,        mm3                         ;

        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40

        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00

        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20

        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00 (rax still negative here)
        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40

        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60

        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20

        psrlq       mm6,        32                          ; 33 32 31 30
        movd        [rsi],      mm1                         ; write 43 42 41 40

        movd        [rsi + rax], mm6                        ; write 33 32 31 30
        neg         rax                                     ; back to a positive stride for the rows below

        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
        psrlq       mm1,        32                          ; 53 52 51 50

        movd        [rdi],      mm1                         ; write out 53 52 51 50
        psrlq       mm5,        32                          ; 73 72 71 70

        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70

        lea         rsi,        [rsi+rax*8]                 ; advance to the next 8 rows

    dec         rcx
        jnz         .nexts8_v

    add rsp, 32                                             ; release the 32-byte scratch area
    pop rsp                                                 ; restore the rsp saved by ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
1709
1710
1711
;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
;                  int y_stride,
;                  loop_filter_info *lfi)
;{
;
;
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim);
;}
1722
SECTION_RODATA
align 16
tfe:
    times 8 db 0xfe         ; per-byte mask: clears each byte's lsb so a word psrlw emulates a per-byte >>1
align 16
t80:
    times 8 db 0x80         ; sign bias: XOR converts unsigned pixels <-> signed bytes
align 16
t1s:
    times 8 db 0x01         ; per-byte 1: turns the Filter+4 value into Filter+3
align 16
t3:
    times 8 db 0x03         ; filter rounding constant (+3 side)
align 16
t4:
    times 8 db 0x04         ; filter rounding constant (+4 side)
align 16
ones:
    times 4 dw 0x0001       ; word constant 1 (user not visible in this chunk)
align 16
s27:
    times 4 dw 0x1b00       ; 27 in each high byte; presumably a pmulhw tap for the MB filter -- confirm
align 16
s18:
    times 4 dw 0x1200       ; 18 in each high byte (companion to s27)
align 16
s9:
    times 4 dw 0x0900       ; 9 in each high byte (companion to s27)
align 16
s63:
    times 4 dw 0x003f       ; 63: rounding constant (companion to s27)
1754