1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14
;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
;-----------------------------------------------------------------------
; VP8 normal loop filter across a horizontal edge (MMX, 8 pixels/pass).
; src_ptr points at the q0 row (first row below the edge).  For each of
; `count` groups of 8 pixel columns it:
;   1. builds the filter mask (mm1): 0xFF per pixel only when every
;      neighbouring-tap difference abs(q3-q2), abs(q2-q1), abs(q1-q0),
;      abs(p3-p2), abs(p2-p1), abs(p1-p0) is <= limit AND
;      abs(p0-q0)*2 + abs(p1-q1)/2 is <= flimit*2 + limit;
;   2. builds the high-edge-variance mask (mm4): 0xFF where
;      abs(p1-p0) > thresh or abs(q1-q0) > thresh;
;   3. clamps/shifts the common filter value 3*(q0-p0) + hev(p1-q1)
;      and applies it in place to p1, p0, q0 and q1.
; Register roles in the loop: rsi = group base (row q0),
; rdi = rsi + pitch (row q1), rax = pitch (negated while addressing the
; p rows above the edge), rcx = groups remaining.
; Stack scratch: t0 = abs(q0-q1), t1 = abs(p1-p0), reused by the hev
; test after mm registers have been recycled.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_mmx)
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        movsxd      rcx, dword ptr arg(5) ;count
next8_h:
        mov         rdx, arg(3) ;limit
        movq        mm7, [rdx]
        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions
        movq        mm2, [rdi+2*rax]      ; q3
        movq        mm1, [rsi+2*rax]      ; q2
        movq        mm6, mm1              ; q2
        psubusb     mm1, mm2              ; q2-=q3
        psubusb     mm2, mm6              ; q3-=q2
        por         mm1, mm2              ; abs(q3-q2)
        psubusb     mm1, mm7              ; nonzero where abs(q3-q2) > limit


        movq        mm4, [rsi+rax]        ; q1
        movq        mm3, mm4              ; q1
        psubusb     mm4, mm6              ; q1-=q2
        psubusb     mm6, mm3              ; q2-=q1
        por         mm4, mm6              ; abs(q2-q1)

        psubusb     mm4, mm7              ; nonzero where abs(q2-q1) > limit
        por        mm1, mm4

        movq        mm4, [rsi]            ; q0
        movq        mm0, mm4              ; q0
        psubusb     mm4, mm3              ; q0-=q1
        psubusb     mm3, mm0              ; q1-=q0
        por         mm4, mm3              ; abs(q0-q1)
        movq        t0, mm4               ; save to t0 for the hev test below
        psubusb     mm4, mm7              ; nonzero where abs(q0-q1) > limit
        por        mm1, mm4


        neg         rax                   ; negate pitch to deal with above border

        movq        mm2, [rsi+4*rax]      ; p3
        movq        mm4, [rdi+4*rax]      ; p2
        movq        mm5, mm4              ; p2
        psubusb     mm4, mm2              ; p2-=p3
        psubusb     mm2, mm5              ; p3-=p2
        por         mm4, mm2              ; abs(p3 - p2)
        psubusb     mm4, mm7              ; nonzero where abs(p3-p2) > limit
        por        mm1, mm4


        movq        mm4, [rsi+2*rax]      ; p1
        movq        mm3, mm4              ; p1
        psubusb     mm4, mm5              ; p1-=p2
        psubusb     mm5, mm3              ; p2-=p1
        por         mm4, mm5              ; abs(p2 - p1)
        psubusb     mm4, mm7              ; nonzero where abs(p2-p1) > limit
        por        mm1, mm4

        movq        mm2, mm3              ; p1

        movq        mm4, [rsi+rax]        ; p0
        movq        mm5, mm4              ; p0
        psubusb     mm4, mm3              ; p0-=p1
        psubusb     mm3, mm5              ; p1-=p0
        por         mm4, mm3              ; abs(p1 - p0)
        movq        t1, mm4               ; save to t1 for the hev test below
        psubusb     mm4, mm7              ; nonzero where abs(p1-p0) > limit
        por        mm1, mm4

        movq        mm3, [rdi]            ; q1
        movq        mm4, mm3              ; q1
        psubusb     mm3, mm2              ; q1-=p1
        psubusb     mm2, mm4              ; p1-=q1
        por         mm2, mm3              ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
        psrlw       mm2, 1                ; abs(p1-q1)/2

        movq        mm6, mm5              ; p0
        movq        mm3, [rsi]            ; q0
        psubusb     mm5, mm3              ; p0-=q0
        psubusb     mm3, mm6              ; q0-=p0
        por         mm5, mm3              ; abs(p0 - q0)
        paddusb     mm5, mm5              ; abs(p0-q0)*2
        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2) ;flimit           ; get flimit
        movq        mm2, [rdx]            ; flimit mm2
        paddb       mm2, mm2              ; flimit*2 (less than 255)
        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)

        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         mm1,    mm5
        pxor        mm5,    mm5
        pcmpeqb     mm1,    mm5           ; mask mm1: 0xFF where every breakout test passed

        ; calculate high edge variance
        mov         rdx, arg(4) ;thresh           ; get thresh
        movq        mm7, [rdx]            ;
        movq        mm4, t0               ; get abs (q0 - q1)
        psubusb     mm4, mm7              ; nonzero where abs(q0-q1) > thresh
        movq        mm3, t1               ; get abs (p1 - p0)
        psubusb     mm3, mm7              ; nonzero where abs(p1-p0) > thresh
        por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
                                          ; NOTE: por, not paddb -- byte addition wraps mod
                                          ; 256, so two large excesses could sum to zero and
                                          ; silently drop the hev flag; por is wrap-free and
                                          ; matches the vertical-edge version.

        pcmpeqb     mm4,        mm5       ; 0xFF where NOT hev

        pcmpeqb     mm5,        mm5
        pxor        mm4,        mm5       ; invert: mm4 = hev mask (0xFF where hev)


        ; start work on filters
        movq        mm2, [rsi+2*rax]      ; p1
        movq        mm7, [rdi]            ; q1
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1
        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                  ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; arithmetic >>3 on signed bytes: widen to words (byte lands in
        ; the high half, so word >>11 == byte >>3), then repack
        pxor        mm0, mm0             ;
        pxor        mm5, mm5
        punpcklbw   mm0, mm2            ;
        punpckhbw   mm5, mm2            ;
        psraw       mm0, 11             ;
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0, mm0              ; 0
        movq        mm5, mm1              ; abcdefgh
        punpcklbw   mm0, mm1              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        pxor        mm1, mm1              ; 0
        punpckhbw   mm1, mm5              ; a0b0c0d0
        psraw       mm1, 11               ; sign extended shift right by 3
        movq        mm5, mm0              ; save results

        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [GLOBAL(ones)]   ; round before the extra shift
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
        pandn       mm4, mm5              ; high edge variance additive (zeroed where hev)

        paddsb      mm6, mm2              ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+rax], mm6        ; write back

        movq        mm6, [rsi+2*rax]      ; p1
        pxor        mm6, [GLOBAL(t80)]    ; reoffset
        paddsb      mm6, mm4              ; p1+= p1 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+2*rax], mm6      ; write back

        psubsb      mm3, mm0              ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]    ; unoffset
        movq        [rsi], mm3            ; write back

        psubsb      mm7, mm4              ; q1-= q1 add
        pxor        mm7, [GLOBAL(t80)]    ; unoffset
        movq        [rdi], mm7            ; write back

        add         rsi,8                 ; advance to the next 8 columns
        neg         rax                   ; restore positive pitch
        dec         rcx
        jnz         next8_h

    add rsp, 32
    pop rsp                               ; restore pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
227
228
;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;-----------------------------------------------------------------------
; VP8 normal loop filter across a vertical edge (MMX, 8 rows/pass).
; The taps p3..q3 lie side by side in memory, so each group of 8 rows
; is first transposed with byte/word/dword unpacks until every tap
; occupies one mmx register; p1/p0/q0/q1 are spilled to the scratch
; buffer srct at offsets 0/8/16/24.  The same mask / hev / filter
; arithmetic as the horizontal version is then applied, and the four
; modified columns are transposed back and stored with 4-byte writes
; at x offset +2 (rsi was pre-offset by -4, so +2 = two pixels left of
; the edge, covering p1 p0 q0 q1).
; Register roles: rsi = src + 4*pitch - 4 for the current group,
; rdi = rsi + pitch, rax = pitch (sign toggled while addressing the
; rows above), rcx = groups remaining.
; Stack scratch: t0 = abs(p1-p0), t1 = abs(q1-q0) for the hev test.
; In the transpose comments "RC" means source row R, pixel column C.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_vertical_edge_mmx)
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 64      ; reserve 64 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        lea         rsi,        [rsi + rax*4 - 4]  ; 4 rows down, 4 pixels left of the edge

        movsxd      rcx,        dword ptr arg(5) ;count
next8_v:
        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
        add         rdi,        rax


        ;transpose
        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60 (copy of row 6)

        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60

        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40

        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40

        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46

        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40

        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40

        neg         rax                                     ; address the rows above
        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20

        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24

        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00

        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04

        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04

        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3

        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2

        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
        psubusb     mm5,        mm7                         ; q2-q3

        psubusb     mm7,        mm6                         ; q3-q2
        por         mm7,        mm5;                        ; mm7=abs (q3-q2)

        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1

        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1

        psubusb     mm3,        mm6                         ; q1-q2
        psubusb     mm6,        mm5                         ; q2-q1

        por         mm6,        mm3                         ; mm6=abs(q2-q1)
        lea         rdx,        srct                        ; rdx -> scratch transpose buffer

        movq        [rdx+24],   mm5                         ; save q1
        movq        [rdx+16],   mm0                         ; save q0

        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00

        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00

        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00

        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2

        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
        psubusb     mm2,        mm0                         ; p2-p3

        psubusb     mm0,        mm1                         ; p3-p2
        por         mm0,        mm2                         ; mm0=abs(p3-p2)

        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1

        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+8],    mm3                         ; save p0

        movq        [rdx],      mm2                         ; save p1
        movq        mm5,        mm2                         ; mm5 = p1

        psubusb     mm2,        mm1                         ; p1-p2
        psubusb     mm1,        mm5                         ; p2-p1

        por         mm1,        mm2                         ; mm1=abs(p2-p1)
        mov         rdx,        arg(3) ;limit

        movq        mm4,        [rdx]                       ; mm4 = limit
        psubusb     mm7,        mm4                         ; nonzero where abs(q3-q2) > limit

        psubusb     mm0,        mm4                         ; nonzero where abs(p3-p2) > limit
        psubusb     mm1,        mm4                         ; nonzero where abs(p2-p1) > limit

        psubusb     mm6,        mm4                         ; nonzero where abs(q2-q1) > limit
        por         mm7,        mm6

        por         mm0,        mm1
        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

        movq        mm1,        mm5                         ; p1

        movq        mm7,        mm3                         ; mm3=mm7=p0
        psubusb     mm7,        mm5                         ; p0 - p1

        psubusb     mm5,        mm3                         ; p1 - p0
        por         mm5,        mm7                         ; abs(p1-p0)

        movq        t0,         mm5                         ; save abs(p1-p0) for the hev test
        lea         rdx,        srct

        psubusb     mm5,        mm4                         ; nonzero where abs(p1-p0) > limit
        por         mm0,        mm5                         ; mm0=mask

        movq        mm5,        [rdx+16]                    ; mm5=q0
        movq        mm7,        [rdx+24]                    ; mm7=q1

        movq        mm6,        mm5                         ; mm6=q0
        movq        mm2,        mm7                         ; q1
        psubusb     mm5,        mm7                         ; q0-q1

        psubusb     mm7,        mm6                         ; q1-q0
        por         mm7,        mm5                         ; abs(q1-q0)

        movq        t1,         mm7                         ; save abs(q1-q0) for the hev test
        psubusb     mm7,        mm4                         ; nonzero where abs(q1-q0) > limit

        por         mm0,        mm7                         ; mask

        movq        mm5,        mm2                         ; q1
        psubusb     mm5,        mm1                         ; q1-=p1
        psubusb     mm1,        mm2                         ; p1-=q1
        por         mm5,        mm1                         ; abs(p1-q1)
        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
        psrlw       mm5,        1                           ; abs(p1-q1)/2

        mov         rdx,        arg(2) ;flimit                      ;

        movq        mm2,        [rdx]                       ;flimit  mm2
        movq        mm1,        mm3                         ; mm1=mm3=p0

        movq        mm7,        mm6                         ; mm7=mm6=q0
        psubusb     mm1,        mm7                         ; p0-q0

        psubusb     mm7,        mm3                         ; q0-p0
        por         mm1,        mm7                         ; abs(q0-p0)
        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2

        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)

        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         mm1,        mm0;                        ; mask

        pxor        mm0,        mm0
        pcmpeqb     mm1,        mm0                         ; mm1 = filter mask: 0xFF where every test passed

        ; calculate high edge variance
        mov         rdx,        arg(4) ;thresh            ; get thresh
        movq        mm7,        [rdx]
        ;
        movq        mm4,        t0              ; get abs(p1 - p0) (saved in t0 above)
        psubusb     mm4,        mm7             ; nonzero where abs(p1-p0) > thresh

        movq        mm3,        t1              ; get abs(q1 - q0) (saved in t1 above)
        psubusb     mm3,        mm7             ; nonzero where abs(q1-q0) > thresh

        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     mm4,        mm0             ; 0xFF where NOT hev

        pcmpeqb     mm0,        mm0
        pxor        mm4,        mm0             ; invert: mm4 = hev mask (0xFF where hev)



        ; start work on filters
        lea         rdx,        srct            ; reload transposed taps from scratch

        movq        mm2,        [rdx]           ; p1
        movq        mm7,        [rdx+24]        ; q1

        movq        mm6,        [rdx+8]         ; p0
        movq        mm0,        [rdx+16]        ; q0

        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values

        psubsb      mm2,        mm7             ; p1 - q1
        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)

        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values

        movq        mm3,        mm0             ; q0
        psubsb      mm0,        mm6             ; q0 - p0

        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand       mm1,        mm2              ; mask filter values we don't care about

        movq        mm2,        mm1
        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4

        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
        pxor        mm0,        mm0          ; widen to words for the arithmetic >>3

        pxor        mm5,        mm5
        punpcklbw   mm0,        mm2         ; bytes land in the high half of each word

        punpckhbw   mm5,        mm2         ;
        psraw       mm0,        11              ; word>>11 == signed byte >>3

        psraw       mm5,        11
        packsswb    mm0,        mm5

        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0,        mm0           ; 0
        movq        mm5,        mm1           ; abcdefgh

        punpcklbw   mm0,        mm1           ; e0f0g0h0
        psraw       mm0,        11                ; sign extended shift right by 3

        pxor        mm1,        mm1           ; 0
        punpckhbw   mm1,        mm5           ; a0b0c0d0

        psraw       mm1,        11                ; sign extended shift right by 3
        movq        mm5,        mm0              ; save results

        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5,        [GLOBAL(ones)]   ; round before the extra shift

        paddsw      mm1,        [GLOBAL(ones)]
        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap

        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

        pandn       mm4,        mm5             ; high edge variance additive (zeroed where hev)

        paddsb      mm6,        mm2             ; p0+= p0 add
        pxor        mm6,        [GLOBAL(t80)]   ; unoffset

        ; mm6=p0                               ;
        movq        mm1,        [rdx]           ; p1
        pxor        mm1,        [GLOBAL(t80)]   ; reoffset

        paddsb      mm1,        mm4                 ; p1+= p1 add
        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
        ; mm6 = p0 mm1 = p1

        psubsb      mm3,        mm0                 ; q0-= q0 add
        pxor        mm3,        [GLOBAL(t80)]       ; unoffset

        ; mm3 = q0
        psubsb      mm7,        mm4                 ; q1-= q1 add
        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
        ; mm7 = q1

        ; transpose and write back
        ; mm1 =    72 62 52 42 32 22 12 02
        ; mm6 =    73 63 53 43 33 23 13 03
        ; mm3 =    74 64 54 44 34 24 14 04
        ; mm7 =    75 65 55 45 35 25 15 05

        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02

        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42

        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44

        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02

        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42

        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62


        ; mm2 = 15 14 13 12 05 04 03 02
        ; mm6 = 35 34 33 32 25 24 23 22
        ; mm5 = 55 54 53 52 45 44 43 42
        ; mm1 = 75 74 73 72 65 64 63 62

        ; 4-byte stores at +2 place the filtered p1 p0 q0 q1 back over
        ; columns -2..+1 relative to the edge (rsi is edge-4)

        movd        [rsi+rax*4+2], mm2          ; row 0
        psrlq       mm2,        32

        movd        [rdi+rax*4+2], mm2          ; row 1
        movd        [rsi+rax*2+2], mm6          ; row 2

        psrlq       mm6,        32
        movd        [rsi+rax+2],mm6             ; row 3

        movd        [rsi+2],    mm1             ; row 4
        psrlq       mm1,        32

        movd        [rdi+2],    mm1             ; row 5
        neg         rax                         ; restore positive pitch

        movd        [rdi+rax+2],mm5             ; row 6
        psrlq       mm5,        32

        movd        [rdi+rax*2+2], mm5          ; row 7

        lea         rsi,        [rsi+rax*8]     ; advance to the next group of 8 rows
        dec         rcx
        jnz         next8_v

    add rsp, 64
    pop rsp                                     ; restore pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
600
601
602;void vp8_mbloop_filter_horizontal_edge_mmx
603;(
604;    unsigned char *src_ptr,
605;    int  src_pixel_step,
606;    const char *flimit,
607;    const char *limit,
608;    const char *thresh,
609;    int count
610;)
611global sym(vp8_mbloop_filter_horizontal_edge_mmx)
612sym(vp8_mbloop_filter_horizontal_edge_mmx):
613    push        rbp
614    mov         rbp, rsp
615    SHADOW_ARGS_TO_STACK 6
616    GET_GOT     rbx
617    push        rsi
618    push        rdi
619    ; end prolog
620
621    ALIGN_STACK 16, rax
622    sub          rsp, 32      ; reserve 32 bytes
623    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
624    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
625
626        mov         rsi, arg(0) ;src_ptr
627        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
628
629        movsxd      rcx, dword ptr arg(5) ;count
630next8_mbh:
631        mov         rdx, arg(3) ;limit
632        movq        mm7, [rdx]
633        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
634        add         rdi, rax
635
636        ; calculate breakout conditions
637        movq        mm2, [rdi+2*rax]      ; q3
638
639        movq        mm1, [rsi+2*rax]      ; q2
640        movq        mm6, mm1              ; q2
641        psubusb     mm1, mm2              ; q2-=q3
642        psubusb     mm2, mm6              ; q3-=q2
643        por         mm1, mm2              ; abs(q3-q2)
644        psubusb     mm1, mm7
645
646
647        ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
648        movq        mm4, [rsi+rax]        ; q1
649        movq        mm3, mm4              ; q1
650        psubusb     mm4, mm6              ; q1-=q2
651        psubusb     mm6, mm3              ; q2-=q1
652        por         mm4, mm6              ; abs(q2-q1)
653        psubusb     mm4, mm7
654        por        mm1, mm4
655
656
657        ; mm1 = mask,      mm3=q1, mm7 = limit
658
659        movq        mm4, [rsi]            ; q0
660        movq        mm0, mm4              ; q0
661        psubusb     mm4, mm3              ; q0-=q1
662        psubusb     mm3, mm0              ; q1-=q0
663        por         mm4, mm3              ; abs(q0-q1)
664        movq        t0, mm4               ; save to t0
665        psubusb     mm4, mm7
666        por        mm1, mm4
667
668
669        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
670
671        neg         rax                   ; negate pitch to deal with above border
672
673        movq        mm2, [rsi+4*rax]      ; p3
674        movq        mm4, [rdi+4*rax]      ; p2
675        movq        mm5, mm4              ; p2
676        psubusb     mm4, mm2              ; p2-=p3
677        psubusb     mm2, mm5              ; p3-=p2
678        por         mm4, mm2              ; abs(p3 - p2)
679        psubusb     mm4, mm7
680        por        mm1, mm4
681        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
682
683        movq        mm4, [rsi+2*rax]      ; p1
684        movq        mm3, mm4              ; p1
685        psubusb     mm4, mm5              ; p1-=p2
686        psubusb     mm5, mm3              ; p2-=p1
687        por         mm4, mm5              ; abs(p2 - p1)
688        psubusb     mm4, mm7
689        por        mm1, mm4
690
691        movq        mm2, mm3              ; p1
692
693
694        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
695
696        movq        mm4, [rsi+rax]        ; p0
697        movq        mm5, mm4              ; p0
698        psubusb     mm4, mm3              ; p0-=p1
699        psubusb     mm3, mm5              ; p1-=p0
700        por         mm4, mm3              ; abs(p1 - p0)
701        movq        t1, mm4               ; save to t1
702        psubusb     mm4, mm7
703        por        mm1, mm4
704        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
705        ; mm5 = p0
706        movq        mm3, [rdi]            ; q1
707        movq        mm4, mm3              ; q1
708        psubusb     mm3, mm2              ; q1-=p1
709        psubusb     mm2, mm4              ; p1-=q1
710        por         mm2, mm3              ; abs(p1-q1)
711        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
712        psrlw       mm2, 1                ; abs(p1-q1)/2
713
714        movq        mm6, mm5              ; p0
715        movq        mm3, mm0              ; q0
716        psubusb     mm5, mm3              ; p0-=q0
717        psubusb     mm3, mm6              ; q0-=p0
718        por         mm5, mm3              ; abs(p0 - q0)
719        paddusb     mm5, mm5              ; abs(p0-q0)*2
720        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
721
722        mov         rdx, arg(2) ;flimit           ; get flimit
723        movq        mm2, [rdx]            ; flimit mm2
724        paddb       mm2, mm2              ; flimit*2 (less than 255)
725        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
726
727        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
728        por         mm1,    mm5
729        pxor        mm5,    mm5
730        pcmpeqb     mm1,    mm5           ; mask mm1
731
732        ; mm1 = mask, mm0=q0,  mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
733        ; mm6 = p0,
734
735        ; calculate high edge variance
736        mov         rdx, arg(4) ;thresh           ; get thresh
737        movq        mm7, [rdx]            ;
738        movq        mm4, t0               ; get abs (q1 - q0)
739        psubusb     mm4, mm7
740        movq        mm3, t1               ; get abs (p1 - p0)
741        psubusb     mm3, mm7
742        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
743
744        pcmpeqb     mm4,        mm5
745
746        pcmpeqb     mm5,        mm5
747        pxor        mm4,        mm5
748
749
750
751        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
752        ; mm6 = p0, mm4=hev
753        ; start work on filters
754        movq        mm2, [rsi+2*rax]      ; p1
755        movq        mm7, [rdi]            ; q1
756        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
757        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
758        psubsb      mm2, mm7              ; p1 - q1
759
760        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
761        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
762        movq        mm3, mm0              ; q0
763        psubsb      mm0, mm6              ; q0 - p0
764        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
765        paddsb      mm2, mm0              ; 2 * (q0 - p0)
766        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
767        pand        mm1, mm2              ; mask filter values we don't care about
768
769
770        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
771        movq        mm2, mm1              ; vp8_filter
772        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
773
774        movq        mm5,        mm2       ;
775        paddsb      mm5,        [GLOBAL(t3)];
776
777        pxor        mm0, mm0              ; 0
778        pxor        mm7, mm7              ; 0
779
780        punpcklbw   mm0, mm5              ; e0f0g0h0
781        psraw       mm0, 11               ; sign extended shift right by 3
782        punpckhbw   mm7, mm5              ; a0b0c0d0
783        psraw       mm7, 11               ; sign extended shift right by 3
784        packsswb    mm0, mm7              ; Filter2 >>=3;
785
786        movq        mm5, mm0              ; Filter2
787
788        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
789        pxor        mm0, mm0              ; 0
790        pxor        mm7, mm7              ; 0
791
792        punpcklbw   mm0, mm2              ; e0f0g0h0
793        psraw       mm0, 11               ; sign extended shift right by 3
794        punpckhbw   mm7, mm2              ; a0b0c0d0
795        psraw       mm7, 11               ; sign extended shift right by 3
796        packsswb    mm0, mm7              ; Filter2 >>=3;
797
798        ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
799        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
800        paddsb      mm6, mm5              ; ps0 =ps0 + Fitler2
801
802        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
803        ; vp8_filter &= ~hev;
804        ; Filter2 = vp8_filter;
805        pandn       mm4, mm1              ; vp8_filter&=~hev
806
807
808        ; mm3=qs0, mm4=filter2, mm6=ps0
809
810        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
811        ; s = vp8_signed_char_clamp(qs0 - u);
812        ; *oq0 = s^0x80;
813        ; s = vp8_signed_char_clamp(ps0 + u);
814        ; *op0 = s^0x80;
815        pxor        mm0, mm0
816
817        pxor        mm1, mm1
818        pxor        mm2, mm2
819        punpcklbw   mm1, mm4
820        punpckhbw   mm2, mm4
821        pmulhw      mm1, [GLOBAL(s27)]
822        pmulhw      mm2, [GLOBAL(s27)]
823        paddw       mm1, [GLOBAL(s63)]
824        paddw       mm2, [GLOBAL(s63)]
825        psraw       mm1, 7
826        psraw       mm2, 7
827        packsswb    mm1, mm2
828
829        psubsb      mm3, mm1
830        paddsb      mm6, mm1
831
832        pxor        mm3, [GLOBAL(t80)]
833        pxor        mm6, [GLOBAL(t80)]
834        movq        [rsi+rax], mm6
835        movq        [rsi],     mm3
836
837        ; roughly 2/7th difference across boundary
838        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
839        ; s = vp8_signed_char_clamp(qs1 - u);
840        ; *oq1 = s^0x80;
841        ; s = vp8_signed_char_clamp(ps1 + u);
842        ; *op1 = s^0x80;
843        pxor        mm1, mm1
844        pxor        mm2, mm2
845        punpcklbw   mm1, mm4
846        punpckhbw   mm2, mm4
847        pmulhw      mm1, [GLOBAL(s18)]
848        pmulhw      mm2, [GLOBAL(s18)]
849        paddw       mm1, [GLOBAL(s63)]
850        paddw       mm2, [GLOBAL(s63)]
851        psraw       mm1, 7
852        psraw       mm2, 7
853        packsswb    mm1, mm2
854
855        movq        mm3, [rdi]
856        movq        mm6, [rsi+rax*2]       ; p1
857
858        pxor        mm3, [GLOBAL(t80)]
859        pxor        mm6, [GLOBAL(t80)]
860
861        paddsb      mm6, mm1
862        psubsb      mm3, mm1
863
864        pxor        mm6, [GLOBAL(t80)]
865        pxor        mm3, [GLOBAL(t80)]
866        movq        [rdi], mm3
867        movq        [rsi+rax*2], mm6
868
869        ; roughly 1/7th difference across boundary
870        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
871        ; s = vp8_signed_char_clamp(qs2 - u);
872        ; *oq2 = s^0x80;
873        ; s = vp8_signed_char_clamp(ps2 + u);
874        ; *op2 = s^0x80;
875        pxor        mm1, mm1
876        pxor        mm2, mm2
877        punpcklbw   mm1, mm4
878        punpckhbw   mm2, mm4
879        pmulhw      mm1, [GLOBAL(s9)]
880        pmulhw      mm2, [GLOBAL(s9)]
881        paddw       mm1, [GLOBAL(s63)]
882        paddw       mm2, [GLOBAL(s63)]
883        psraw       mm1, 7
884        psraw       mm2, 7
885        packsswb    mm1, mm2
886
887
888        movq        mm6, [rdi+rax*4]
889        neg         rax
890        movq        mm3, [rdi+rax  ]
891
892        pxor        mm6, [GLOBAL(t80)]
893        pxor        mm3, [GLOBAL(t80)]
894
895        paddsb      mm6, mm1
896        psubsb      mm3, mm1
897
898        pxor        mm6, [GLOBAL(t80)]
899        pxor        mm3, [GLOBAL(t80)]
900        movq        [rdi+rax  ], mm3
901        neg         rax
902        movq        [rdi+rax*4], mm6
903
904;EARLY_BREAK_OUT:
905        neg         rax
906        add         rsi,8
907        dec         rcx
908        jnz         next8_mbh
909
910    add rsp, 32
911    pop rsp
912    ; begin epilog
913    pop rdi
914    pop rsi
915    RESTORE_GOT
916    UNSHADOW_ARGS
917    pop         rbp
918    ret
919
920
;void vp8_mbloop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; Macroblock loop filter applied across a vertical edge.  The pixels to be
; filtered lie in columns, so each group of 8 rows is transposed into MMX
; registers, filtered as if it were a horizontal edge, and transposed back.
; p3..q3 (8 bytes each) are staged in the 64-byte aligned scratch area
; 'srct'; t0/t1 hold abs(p1-p0) and abs(q1-q0) for the hev test.
; Processes arg(5) = count groups of 8 rows.  Clobbers mm0-mm7.
global sym(vp8_mbloop_filter_vertical_edge_mmx)
sym(vp8_mbloop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 96      ; reserve 96 bytes of aligned scratch
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rax,        dword ptr arg(1) ;src_pixel_step (row stride in bytes)

        lea         rsi,        [rsi + rax*4 - 4]   ; point 4 rows down, 4 bytes left of the edge

        movsxd      rcx,        dword ptr arg(5) ;count
next8_mbv:
        lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing

        ;transpose the lower 4 rows (rows 4-7); pixel maps are "row col" pairs
        movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60

        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
        punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64

        punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
        movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50

        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40

        punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40

        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46

        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40

        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40

        neg         rax                                     ; negative stride: address rows 0-3

        movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20

        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
        punpckhbw   mm6,        mm7                         ; 37 27 36 26 35 25 34 24

        punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20

        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04

        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06

        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06

        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2

        lea         rdx,        srct                        ; rdx -> scratch; layout: p3..q3 at offsets 0,8,...,56
        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06

        movq        [rdx+56],   mm7                         ; save q3
        psubusb     mm5,        mm7                         ; q2-q3

        movq        [rdx+48],   mm6                         ; save q2
        psubusb     mm7,        mm6                         ; q3-q2

        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04

        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0

        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
        psubusb     mm3,        mm6                         ; q1-q2

        psubusb     mm6,        mm5                         ; q2-q1
        por         mm6,        mm3                         ; mm6=abs(q2-q1)

        movq        [rdx+40],   mm5                         ; save q1
        movq        [rdx+32],   mm0                         ; save q0

        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00

        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00

        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00

        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2

        movq        [rdx],      mm0                         ; save p3
        movq        [rdx+8],    mm1                         ; save p2

        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
        psubusb     mm2,        mm0                         ; p2-p3

        psubusb     mm0,        mm1                         ; p3-p2
        por         mm0,        mm2                         ; mm0=abs(p3-p2)

        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1

        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+24],   mm3                         ; save p0

        movq        [rdx+16],   mm2                         ; save p1
        movq        mm5,        mm2                         ; mm5 = p1

        psubusb     mm2,        mm1                         ; p1-p2
        psubusb     mm1,        mm5                         ; p2-p1

        por         mm1,        mm2                         ; mm1=abs(p2-p1)
        mov         rdx,        arg(3) ;limit

        ; build filter mask: any inner-pixel difference above 'limit' disables filtering
        movq        mm4,        [rdx]                       ; mm4 = limit
        psubusb     mm7,        mm4                         ; abs(q3-q2) > limit

        psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
        psubusb     mm1,        mm4                         ; abs(p2-p1) > limit

        psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
        por         mm7,        mm6                         ; or

        por         mm0,        mm1                         ;
        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

        movq        mm1,        mm5                         ; p1

        movq        mm7,        mm3                         ; mm3=mm7=p0
        psubusb     mm7,        mm5                         ; p0 - p1

        psubusb     mm5,        mm3                         ; p1 - p0
        por         mm5,        mm7                         ; abs(p1-p0)

        movq        t0,         mm5                         ; save abs(p1-p0) for the hev test below
        lea         rdx,        srct

        psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
        por         mm0,        mm5                         ; mm0=mask

        movq        mm5,        [rdx+32]                    ; mm5=q0
        movq        mm7,        [rdx+40]                    ; mm7=q1

        movq        mm6,        mm5                         ; mm6=q0
        movq        mm2,        mm7                         ; q1
        psubusb     mm5,        mm7                         ; q0-q1

        psubusb     mm7,        mm6                         ; q1-q0
        por         mm7,        mm5                         ; abs(q1-q0)

        movq        t1,         mm7                         ; save abs(q1-q0) for the hev test below
        psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit

        por         mm0,        mm7                         ; mask

        movq        mm5,        mm2                         ; q1
        psubusb     mm5,        mm1                         ; q1-=p1
        psubusb     mm1,        mm2                         ; p1-=q1
        por         mm5,        mm1                         ; abs(p1-q1)
        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
        psrlw       mm5,        1                           ; abs(p1-q1)/2

        mov         rdx,        arg(2) ;flimit                      ;

        movq        mm2,        [rdx]                       ;flimit  mm2
        movq        mm1,        mm3                         ; mm1=mm3=p0

        movq        mm7,        mm6                         ; mm7=mm6=q0
        psubusb     mm1,        mm7                         ; p0-q0

        psubusb     mm7,        mm3                         ; q0-p0
        por         mm1,        mm7                         ; abs(q0-p0)
        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2

        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)

        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         mm1,        mm0;                        ; mask
        ; invert: mask bytes become 0xff where ALL threshold tests passed
        pxor        mm0,        mm0
        pcmpeqb     mm1,        mm0

        ; calculate high edge variance (hev): 0xff where either inner
        ; difference exceeds 'thresh'
        mov         rdx,        arg(4) ;thresh            ; get thresh
        movq        mm7,        [rdx]
        ;
        movq        mm4,        t0              ; get abs (q1 - q0)
        psubusb     mm4,        mm7             ; abs(q1 - q0) > thresh

        movq        mm3,        t1              ; get abs (p1 - p0)
        psubusb     mm3,        mm7             ; abs(p1 - p0)> thresh

        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     mm4,        mm0             ; 0xff where NOT hev

        pcmpeqb     mm0,        mm0
        pxor        mm4,        mm0             ; invert: mm4 = hev mask




        ; start work on filters
        lea         rdx,        srct

        ; start work on filters
        movq        mm2, [rdx+16]         ; p1
        movq        mm7, [rdx+40]         ; q1
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1

        movq        mm6, [rdx+24]         ; p0
        movq        mm0, [rdx+32]         ; q0
        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values

        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
        pand       mm1, mm2           ; mask filter values we don't care about

        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
        movq        mm2, mm1              ; vp8_filter
        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev

        movq        mm5,        mm2       ;
        paddsb      mm5,        [GLOBAL(t3)]; Filter2 + 3 (saturating)

        ; >>3 with sign: unpack bytes into the high half of words, psraw by
        ; 11 (= 8 + 3), then repack with signed saturation
        pxor        mm0, mm0              ; 0
        pxor        mm7, mm7              ; 0

        punpcklbw   mm0, mm5              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        punpckhbw   mm7, mm5              ; a0b0c0d0
        psraw       mm7, 11               ; sign extended shift right by 3
        packsswb    mm0, mm7              ; (Filter2 + 3) >> 3

        movq        mm5, mm0              ; Filter2

        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
        pxor        mm0, mm0              ; 0
        pxor        mm7, mm7              ; 0

        punpcklbw   mm0, mm2              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        punpckhbw   mm7, mm2              ; a0b0c0d0
        psraw       mm7, 11               ; sign extended shift right by 3
        packsswb    mm0, mm7              ; (Filter2 + 4) >> 3 = Filter1

        ; mm0= filter1 mm1 = vp8_filter,  mm3 =qs0 mm5=Filter2 mm4 =hev mm6=ps0
        psubsb      mm3, mm0              ; qs0 = qs0 - Filter1
        paddsb      mm6, mm5              ; ps0 = ps0 + Filter2

        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
        ; vp8_filter &= ~hev;
        ; Filter2 = vp8_filter;
        pandn       mm4, mm1              ; vp8_filter&=~hev


        ; mm3=qs0, mm4=filter2, mm6=ps0

        ; roughly 3/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
        ; s = vp8_signed_char_clamp(qs0 - u);
        ; *oq0 = s^0x80;
        ; s = vp8_signed_char_clamp(ps0 + u);
        ; *op0 = s^0x80;
        pxor        mm0, mm0

        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s27)]    ; high word of x*27*2^8 = Filter2*27>>8... scaled so >>7 below lands right
        pmulhw      mm2, [GLOBAL(s27)]
        paddw       mm1, [GLOBAL(s63)]    ; + 63 (rounding bias)
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2              ; u, clamped to signed char

        psubsb      mm3, mm1              ; qs0 - u
        paddsb      mm6, mm1              ; ps0 + u

        pxor        mm3, [GLOBAL(t80)]    ; unoffset back to unsigned
        pxor        mm6, [GLOBAL(t80)]
        movq        [rdx+24], mm6         ; store filtered p0 back to scratch
        movq        [rdx+32], mm3         ; store filtered q0 back to scratch

        ; roughly 2/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
        ; s = vp8_signed_char_clamp(qs1 - u);
        ; *oq1 = s^0x80;
        ; s = vp8_signed_char_clamp(ps1 + u);
        ; *op1 = s^0x80;
        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s18)]
        pmulhw      mm2, [GLOBAL(s18)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2              ; u

        movq        mm3, [rdx + 40]       ; q1
        movq        mm6, [rdx + 16]       ; p1
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]

        paddsb      mm6, mm1              ; ps1 + u
        psubsb      mm3, mm1              ; qs1 - u

        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        pxor        mm3, [GLOBAL(t80)]
        movq        [rdx + 40], mm3       ; store filtered q1
        movq        [rdx + 16], mm6       ; store filtered p1

        ; roughly 1/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
        ; s = vp8_signed_char_clamp(qs2 - u);
        ; *oq2 = s^0x80;
        ; s = vp8_signed_char_clamp(ps2 + u);
        ; *op2 = s^0x80;
        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s9)]
        pmulhw      mm2, [GLOBAL(s9)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2              ; u

        movq        mm6, [rdx+ 8]         ; p2
        movq        mm3, [rdx+48]         ; q2

        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]

        paddsb      mm6, mm1              ; ps2 + u
        psubsb      mm3, mm1              ; qs2 - u

        pxor        mm6, [GLOBAL(t80)]          ; mm6 = filtered p2 = 71 61 51 41 31 21 11 01
        pxor        mm3, [GLOBAL(t80)]          ; mm3 = filtered q2 = 76 66 56 46 36 26 16 06

        ; transpose back to columns and write out
        movq        mm0,    [rdx]               ; mm0 = p3 = 70 60 50 40 30 20 10 00
        movq        mm1,    mm0                 ; mm1 = 70 60 50 40 30 20 10 00

        punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
        punpckhbw   mm1,    mm6                 ; mm1 = 71 70 61 60 51 50 41 40

        movq        mm2,    [rdx+16]            ; mm2 = p1 = 72 62 52 42 32 22 12 02
        movq        mm6,    mm2                 ; mm6 = 72 62 52 42 32 22 12 02

        punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
        punpckhbw   mm6,    [rdx+24]            ; mm6 = 73 72 63 62 53 52 43 42

        movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
        punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00

        punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
        movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40

        punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
        punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60

        movq        mm2,    [rdx+32]            ; mm2 = q0 = 74 64 54 44 34 24 14 04
        punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04

        movq        mm6,    mm3                 ; mm6 = q2 = 76 66 56 46 36 26 16 06
        punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06

        movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
        punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04

        punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
        movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00

        punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
        punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10

        ; note rax is still negative here, so rsi+rax*4 is row 0, rdi+rax*4 is row 1
        movq        [rsi+rax*4], mm0            ; write out row 0
        movq        [rdi+rax*4], mm6            ; write out row 1

        movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
        punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 21 20

        punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
        movq        [rsi+rax*2], mm0            ; write out row 2

        movq        [rdi+rax*2], mm5            ; write out row 3
        movq        mm2,    [rdx+32]            ; mm2 = q0 = 74 64 54 44 34 24 14 04

        punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 55 54 45 44
        punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46

        movq        mm5,    mm2                 ; mm5 = 75 74 65 64 55 54 45 44
        punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44

        punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
        movq        mm0,    mm1                 ; mm0=  53 52 51 50 43 42 41 40

        movq        mm3,    mm4                 ; mm3 = 73 72 71 70 63 62 61 60
        punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40

        punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
        movq        [rsi],  mm0                 ; write out row 4

        movq        [rdi],  mm1                 ; write out row 5
        neg         rax                         ; restore positive stride

        punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
        punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 70

        movq        [rsi+rax*2], mm3            ; write out row 6
        movq        [rdi+rax*2], mm4            ; write out row 7

        lea         rsi,        [rsi+rax*8]     ; advance to the next group of 8 rows
        dec         rcx

        jnz         next8_mbv

    add rsp, 96
    pop rsp                 ; restore pre-ALIGN_STACK rsp saved by the macro
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
1389
1390
;void vp8_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; Simple loop filter across a horizontal edge: only p1/p0/q0/q1 are read and
; only p0/q0 are written.  'thresh' (arg 4) is unused by the simple filter.
; src_ptr points at the q0 row; processes arg(5) = count blocks of 8 pixels.
; Clobbers mm0-mm7.
global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step (row stride in bytes)

        movsxd      rcx, dword ptr arg(5) ;count
nexts8_h:
        ; threshold = flimit * 2 + limit (all byte lanes)
        mov         rdx, arg(3) ;limit
        movq        mm7, [rdx]
        mov         rdx, arg(2) ;flimit           ; get flimit
        movq        mm3, [rdx]            ;
        paddb       mm3, mm3              ; flimit*2 (less than 255)
        paddb       mm3, mm7              ; flimit * 2 + limit (less than 255)

        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
        add         rdi, rax
        neg         rax                   ; rax is negative for the rest of the iteration

        ; calculate mask: filter only where
        ; abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit
        movq        mm1, [rsi+2*rax]      ; p1
        movq        mm0, [rdi]            ; q1
        movq        mm2, mm1              ; keep unsigned p1 for the filter step
        movq        mm7, mm0              ; keep unsigned q1 for the filter step
        movq        mm4, mm0
        psubusb     mm0, mm1              ; q1-=p1
        psubusb     mm1, mm4              ; p1-=q1
        por         mm1, mm0              ; abs(p1-q1)
        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
        psrlw       mm1, 1                ; abs(p1-q1)/2

        movq        mm5, [rsi+rax]        ; p0
        movq        mm4, [rsi]            ; q0
        movq        mm0, mm4              ; q0
        movq        mm6, mm5              ; p0
        psubusb     mm5, mm4              ; p0-=q0
        psubusb     mm4, mm6              ; q0-=p0
        por         mm5, mm4              ; abs(p0 - q0)
        paddusb     mm5, mm5              ; abs(p0-q0)*2
        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        pxor        mm3, mm3
        pcmpeqb     mm5, mm3              ; mm5 = mask: 0xff where the edge should be filtered

        ; start work on filters
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1

        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        mm5, mm2              ; mask filter values we don't care about

        ; do + 4 side: Filter1 = clamp(filter + 4) >> 3, applied to q0
        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4

        ; per-byte arithmetic >>3 without unpacking: handle low and high
        ; bytes of each word separately, then recombine
        movq        mm0, mm5              ; get a copy of filters
        psllw       mm0, 8                ; shift left 8 (isolate low bytes in high half)
        psraw       mm0, 3                ; arithmetic shift right 3
        psrlw       mm0, 8                ; move back to low byte position
        movq        mm1, mm5              ; get a copy of filters
        psraw       mm1, 11               ; high bytes: arithmetic shift right 3 (8+3)
        psllw       mm1, 8                ; shift left 8 to put it back

        por         mm0, mm1              ; put the two together to get result

        psubsb      mm3, mm0              ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]    ; unoffset
        movq        [rsi], mm3            ; write back q0


        ; now do +3 side: Filter2 = clamp(filter + 3) >> 3, applied to p0
        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4

        movq        mm0, mm5              ; get a copy of filters
        psllw       mm0, 8                ; shift left 8
        psraw       mm0, 3                ; arithmetic shift right 3
        psrlw       mm0, 8
        psraw       mm5, 11               ; high bytes: arithmetic shift right 3 (8+3)
        psllw       mm5, 8                ; shift left 8 to put it back
        por         mm0, mm5              ; put the two together to get result


        paddsb      mm6, mm0              ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+rax], mm6        ; write back p0 (rax is negative here)

        add         rsi,8                 ; next 8 pixels along the edge
        neg         rax                   ; restore positive stride for next iteration
        dec         rcx
        jnz         nexts8_h

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
1512
1513
1514;void vp8_loop_filter_simple_vertical_edge_mmx
1515;(
1516;    unsigned char *src_ptr,
1517;    int  src_pixel_step,
1518;    const char *flimit,
1519;    const char *limit,
1520;    const char *thresh,
1521;    int count
1522;)
;-----------------------------------------------------------------------
; Simple (two-pixel) VP8 loop filter applied across a VERTICAL edge.
; Per iteration it loads a block of 8 rows x 4 columns straddling the
; edge, transposes it so that the columns p1/p0/q0/q1 become MMX rows,
; runs the same filter math as the horizontal version, transposes back
; and stores. 'thresh' (arg 4) is not read by the simple filter.
; Loops arg(5) (count) times, stepping 8 rows each iteration.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_simple_vertical_edge_mmx)
sym(vp8_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 32      ; reserve 32 bytes: two 16-byte-aligned 8-byte spill slots
    %define t0   [rsp + 0]    ; spill slot for the unsigned p1 row
    %define t1   [rsp + 16]   ; spill slot for the unsigned q1 row

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step (row pitch)

        lea         rsi, [rsi + rax*4- 2];  ; rsi -> row 4 of the 8-row block, 2 bytes left of the edge (column p1)
        movsxd      rcx, dword ptr arg(5) ;count
nexts8_v:

        lea         rdi,        [rsi + rax];                    ; rdi -> row 5
        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70  (row 7)

        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60  (row 6)
        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60

        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50  (row 5)
        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40  (row 4)

        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40

        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42

        neg         rax                                         ; negate pitch to address rows 0..3 above rsi

        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30  (row 3)
        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20  (row 2)

        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10  (row 1)

        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00  (row 0)
        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00

        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00

        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                             ; 31 21 11 01 30 20 10 00

        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02

        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0

        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1


        ; calculate mask
        movq        mm6,        mm0                             ; p1
        movq        mm7,        mm3                             ; q1
        psubusb     mm7,        mm6                             ; q1-=p1 (unsigned saturating)
        psubusb     mm6,        mm3                             ; p1-=q1 (unsigned saturating)
        por         mm6,        mm7                             ; abs(p1-q1)
        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
        psrlw       mm6,        1                               ; abs(p1-q1)/2 (lsb already cleared, so no cross-byte bleed)

        movq        mm5,        mm1                             ; p0
        movq        mm4,        mm2                             ; q0

        psubusb     mm5,        mm2                             ; p0-=q0
        psubusb     mm4,        mm1                             ; q0-=p0

        por         mm5,        mm4                             ; abs(p0 - q0)
        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx,        arg(2) ;flimit                          ; get flimit
        movq        mm7,        [rdx]
        mov         rdx,        arg(3)                          ; get limit
        movq        mm6,        [rdx]
        paddb       mm7,        mm7                             ; flimit*2 (less than 255)
        paddb       mm7,        mm6                             ; flimit * 2 + limit (less than 255)

        psubusb     mm5,        mm7                             ; nonzero where abs(p0-q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
        pxor        mm7,        mm7
        pcmpeqb     mm5,        mm7                             ; mm5 = mask (0xFF = filter this pixel)

        ; start work on filters: spill unsigned p1/q1 for the output transpose
        movq        t0,         mm0
        movq        t1,         mm3

        pxor        mm0,        [GLOBAL(t80)]                   ; p1: xor 0x80 converts unsigned bytes to signed
        pxor        mm3,        [GLOBAL(t80)]                   ; q1: same signed conversion

        psubsb      mm0,        mm3                             ; p1 - q1
        movq        mm6,        mm1                             ; p0

        movq        mm7,        mm2                             ; q0
        pxor        mm6,        [GLOBAL(t80)]                   ; p0 to signed

        pxor        mm7,        [GLOBAL(t80)]                   ; q0 to signed
        movq        mm3,        mm7                             ; keep signed q0 for the write-back below

        psubsb      mm7,        mm6                             ; q0 - p0
        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)

        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)

        pand        mm5,        mm0                             ; mask filter values we don't care about

        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4

        ; MMX has no per-byte arithmetic shift; emulate a signed >>3 of each
        ; byte with two word-sized pipelines (even/odd bytes) and merge.
        movq        mm0,        mm5                             ; get a copy of filters
        psllw       mm0,        8                               ; move even (low) bytes to each word's high half
        psraw       mm0,        3                               ; arithmetic shift right 3
        psrlw       mm0,        8                               ; move results back to the low bytes

        movq        mm7,        mm5                             ; get a copy of filters
        psraw       mm7,        11                              ; odd (high) bytes: arithmetic shift right 8+3
        psllw       mm7,        8                               ; shift left 8 to put them back

        por         mm0,        mm7                             ; merged per-byte (filter+4)>>3

        psubsb      mm3,        mm0                             ; q0 -= (filter+4)>>3
        pxor        mm3,        [GLOBAL(t80)]                   ; back to unsigned

        ; now do +3 side
        psubsb      mm5, [GLOBAL(t1s)]                          ; subtract per-byte 1: +3 instead of +4

        movq        mm0, mm5                                    ; get a copy of filters
        psllw       mm0, 8                                      ; even bytes up to the word high half
        psraw       mm0, 3                                      ; arithmetic shift right 3
        psrlw       mm0, 8                                      ; and back down

        psraw       mm5, 11                                     ; odd bytes: arithmetic shift right 8+3
        psllw       mm5, 8                                      ; shift left 8 to put them back
        por         mm0, mm5                                    ; merged per-byte (filter+3)>>3

        paddsb      mm6, mm0                                    ; p0 += (filter+3)>>3
        pxor        mm6, [GLOBAL(t80)]                          ; back to unsigned


        movq        mm0,        t0                              ; reload unchanged unsigned p1
        movq        mm4,        t1                              ; reload unchanged unsigned q1

        ; mm0 = 70 60 50 40 30 20 10 00  (p1)
        ; mm6 = 71 61 51 41 31 21 11 01  (new p0)
        ; mm3 = 72 62 52 42 32 22 12 02  (new q0)
        ; mm4 = 73 63 53 43 33 23 13 03  (q1)
        ; transpose back to write out

        movq        mm1,        mm0                         ;
        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00

        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
        movq        mm2,        mm3                         ;

        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40

        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00

        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20

        ; rax is still negative here, so rsi+rax*N addresses rows above rsi
        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40

        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60

        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20

        psrlq       mm6,        32                          ; 33 32 31 30
        movd        [rsi],      mm1                         ; write 43 42 41 40

        movd        [rsi + rax], mm6                        ; write 33 32 31 30
        neg         rax                                     ; pitch positive again for the rows below

        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
        psrlq       mm1,        32                          ; 53 52 51 50

        movd        [rdi],      mm1                         ; write out 53 52 51 50
        psrlq       mm5,        32                          ; 73 72 71 70

        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70

        lea         rsi,        [rsi+rax*8]                 ; advance 8 rows to the next block

        dec         rcx                                     ; count--
        jnz         nexts8_v

    add rsp, 32                 ; release the t0/t1 spill area
    pop rsp                     ; restore the pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
1733
1734
1735
1736;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
1737;                  int y_stride,
1738;                  loop_filter_info *lfi)
1739;{
1740;
1741;
1742;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1743;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1744;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1745;}
1746
SECTION_RODATA
align 16
tfe:        ; per-byte mask 0xFE: clears each byte's lsb so psrlw 1 acts as a per-byte /2
    times 8 db 0xfe
align 16
t80:        ; per-byte 0x80: xor toggles bytes between unsigned and signed representations
    times 8 db 0x80
align 16
t1s:        ; per-byte 1: subtracted to turn the "+4" rounding term into "+3"
    times 8 db 0x01
align 16
t3:         ; per-byte 3 (filter rounding constant)
    times 8 db 0x03
align 16
t4:         ; per-byte 4 (filter rounding constant)
    times 8 db 0x04
align 16
ones:       ; per-word 1
    times 4 dw 0x0001
align 16
s27:        ; 27 placed in each word's high byte -- presumably a tap for the
            ; macroblock filter's 27/18/9 weighting; its users are outside this chunk
    times 4 dw 0x1b00
align 16
s18:        ; 18 in each word's high byte (see s27)
    times 4 dw 0x1200
align 16
s9:         ; 9 in each word's high byte (see s27)
    times 4 dw 0x0900
align 16
s63:        ; per-word 63 -- presumably the rounding term paired with the taps above; confirm at the use sites
    times 4 dw 0x003f
1778