1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include <math.h>
13#include <stdlib.h>
14#include "vpx_scale/yv12config.h"
15#include "pragmas.h"
16
17#define VP8_FILTER_WEIGHT 128
18#define VP8_FILTER_SHIFT  7
19
20
21
22/* static constants */
23__declspec(align(16))
24const static short  Blur[48] =
25{
26
27    16, 16, 16, 16, 16, 16, 16, 16,
28    16, 16, 16, 16, 16, 16, 16, 16,
29    64, 64, 64, 64, 64, 64, 64, 64,
30    16, 16, 16, 16, 16, 16, 16, 16,
31    16, 16, 16, 16, 16, 16, 16, 16,
32    0,  0,  0,  0,  0,  0,  0,  0,
33
34};
35#define RD  __declspec(align(16)) __int64 rd  = 0x0040004000400040;
36#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
37
38#ifndef RELOCATEABLE
39const static RD;
40const static R4D2;
41#endif
42
43
44/* external references */
/* Defined elsewhere in the project. */
extern double vp8_gaussian(double sigma, double mu, double x);

/* Table of 16-bit values added to the window sum by the mbpost filters
 * below, indexed by (row & 127). */
extern short vp8_rv[];

/* Converts the caller's flimit argument into the threshold the mbpost
 * filters compare against. */
extern int vp8_q2mbl(int x);
48
49
50
51void vp8_post_proc_down_and_across_mmx
52(
53    unsigned char *src_ptr,
54    unsigned char *dst_ptr,
55    int src_pixels_per_line,
56    int dst_pixels_per_line,
57    int rows,
58    int cols,
59    int flimit
60)
61{
62#ifdef RELOCATEABLE
63    RD
64    R4D2
65#endif
66
67    __asm
68    {
69        push        ebx
70        lea         ebx, Blur
71        movd        mm2, flimit
72        punpcklwd   mm2, mm2
73        punpckldq   mm2, mm2
74
75        mov         esi,        src_ptr
76        mov         edi,        dst_ptr
77
78        mov         ecx, DWORD PTR rows
79        mov         eax, src_pixels_per_line ;
80        destination pitch?
81        pxor        mm0, mm0              ;
82        mm0 = 00000000
83
84        nextrow:
85
86        xor         edx,        edx       ;
87
88        clear out edx for use as loop counter
89        nextcol:
90
91        pxor        mm7, mm7              ;
92
93    mm7 = 00000000
94    movq        mm6, [ebx + 32 ]      ;
95        mm6 = kernel 2 taps
96        movq        mm3, [esi]            ;
97        mm4 = r0 p0..p7
98        punpcklbw   mm3, mm0              ;
99        mm3 = p0..p3
100        movq        mm1, mm3              ;
101        mm1 = p0..p3
102        pmullw      mm3, mm6              ;
103        mm3 *= kernel 2 modifiers
104
105        movq        mm6, [ebx + 48]       ;
106        mm6 = kernel 3 taps
107        movq        mm5, [esi + eax]      ;
108        mm4 = r1 p0..p7
109        punpcklbw   mm5, mm0              ;
110        mm5 = r1 p0..p3
111        pmullw      mm6, mm5              ;
112        mm6 *= p0..p3 * kernel 3 modifiers
113        paddusw     mm3, mm6              ;
114        mm3 += mm6
115
116        ;
117        thresholding
118        movq        mm7, mm1              ;
119        mm7 = r0 p0..p3
120        psubusw     mm7, mm5              ;
121        mm7 = r0 p0..p3 - r1 p0..p3
122        psubusw     mm5, mm1              ;
123        mm5 = r1 p0..p3 - r0 p0..p3
124        paddusw     mm7, mm5              ;
125        mm7 = abs(r0 p0..p3 - r1 p0..p3)
126        pcmpgtw     mm7, mm2
127
128        movq        mm6, [ebx + 64 ]      ;
129        mm6 = kernel 4 modifiers
130        movq        mm5, [esi + 2*eax]    ;
131        mm4 = r2 p0..p7
132        punpcklbw   mm5, mm0              ;
133        mm5 = r2 p0..p3
134        pmullw      mm6, mm5              ;
135        mm5 *= kernel 4 modifiers
136        paddusw     mm3, mm6              ;
137        mm3 += mm5
138
139        ;
140        thresholding
141        movq        mm6, mm1              ;
142        mm6 = r0 p0..p3
143        psubusw     mm6, mm5              ;
144        mm6 = r0 p0..p3 - r2 p0..p3
145        psubusw     mm5, mm1              ;
146        mm5 = r2 p0..p3 - r2 p0..p3
147        paddusw     mm6, mm5              ;
148        mm6 = abs(r0 p0..p3 - r2 p0..p3)
149        pcmpgtw     mm6, mm2
150        por         mm7, mm6              ;
151        accumulate thresholds
152
153
154        neg         eax
155        movq        mm6, [ebx ]           ;
156        kernel 0 taps
157        movq        mm5, [esi+2*eax]      ;
158        mm4 = r-2 p0..p7
159        punpcklbw   mm5, mm0              ;
160        mm5 = r-2 p0..p3
161        pmullw      mm6, mm5              ;
162        mm5 *= kernel 0 modifiers
163        paddusw     mm3, mm6              ;
164        mm3 += mm5
165
166        ;
167        thresholding
168        movq        mm6, mm1              ;
169        mm6 = r0 p0..p3
170        psubusw     mm6, mm5              ;
171        mm6 = p0..p3 - r-2 p0..p3
172        psubusw     mm5, mm1              ;
173        mm5 = r-2 p0..p3 - p0..p3
174        paddusw     mm6, mm5              ;
175        mm6 = abs(r0 p0..p3 - r-2 p0..p3)
176        pcmpgtw     mm6, mm2
177        por         mm7, mm6              ;
178        accumulate thresholds
179
180        movq        mm6, [ebx + 16]       ;
181        kernel 1 taps
182        movq        mm4, [esi+eax]        ;
183        mm4 = r-1 p0..p7
184        punpcklbw   mm4, mm0              ;
185        mm4 = r-1 p0..p3
186        pmullw      mm6, mm4              ;
187        mm4 *= kernel 1 modifiers.
188        paddusw     mm3, mm6              ;
189        mm3 += mm5
190
191        ;
192        thresholding
193        movq        mm6, mm1              ;
194        mm6 = r0 p0..p3
195        psubusw     mm6, mm4              ;
196        mm6 = p0..p3 - r-2 p0..p3
197        psubusw     mm4, mm1              ;
198        mm5 = r-1 p0..p3 - p0..p3
199        paddusw     mm6, mm4              ;
200        mm6 = abs(r0 p0..p3 - r-1 p0..p3)
201        pcmpgtw     mm6, mm2
202        por         mm7, mm6              ;
203        accumulate thresholds
204
205
206        paddusw     mm3, rd               ;
207        mm3 += round value
208        psraw       mm3, VP8_FILTER_SHIFT     ;
209        mm3 /= 128
210
211        pand        mm1, mm7              ;
212        mm1 select vals > thresh from source
213        pandn       mm7, mm3              ;
214        mm7 select vals < thresh from blurred result
215        paddusw     mm1, mm7              ;
216        combination
217
218        packuswb    mm1, mm0              ;
219        pack to bytes
220
221        movd        [edi], mm1            ;
222        neg         eax                   ;
223        pitch is positive
224
225
226        add         esi, 4
227        add         edi, 4
228        add         edx, 4
229
230        cmp         edx, cols
231        jl          nextcol
232        // done with the all cols, start the across filtering in place
233        sub         esi, edx
234        sub         edi, edx
235
236
237        push        eax
238        xor         edx,    edx
239        mov         eax,    [edi-4];
240
241        acrossnextcol:
242        pxor        mm7, mm7              ;
243        mm7 = 00000000
244        movq        mm6, [ebx + 32 ]      ;
245        movq        mm4, [edi+edx]        ;
246        mm4 = p0..p7
247        movq        mm3, mm4              ;
248        mm3 = p0..p7
249        punpcklbw   mm3, mm0              ;
250        mm3 = p0..p3
251        movq        mm1, mm3              ;
252        mm1 = p0..p3
253        pmullw      mm3, mm6              ;
254        mm3 *= kernel 2 modifiers
255
256        movq        mm6, [ebx + 48]
257        psrlq       mm4, 8                ;
258        mm4 = p1..p7
259        movq        mm5, mm4              ;
260        mm5 = p1..p7
261        punpcklbw   mm5, mm0              ;
262        mm5 = p1..p4
263        pmullw      mm6, mm5              ;
264        mm6 *= p1..p4 * kernel 3 modifiers
265        paddusw     mm3, mm6              ;
266        mm3 += mm6
267
268        ;
269        thresholding
270        movq        mm7, mm1              ;
271        mm7 = p0..p3
272        psubusw     mm7, mm5              ;
273        mm7 = p0..p3 - p1..p4
274        psubusw     mm5, mm1              ;
275        mm5 = p1..p4 - p0..p3
276        paddusw     mm7, mm5              ;
277        mm7 = abs(p0..p3 - p1..p4)
278        pcmpgtw     mm7, mm2
279
280        movq        mm6, [ebx + 64 ]
281        psrlq       mm4, 8                ;
282        mm4 = p2..p7
283        movq        mm5, mm4              ;
284        mm5 = p2..p7
285        punpcklbw   mm5, mm0              ;
286        mm5 = p2..p5
287        pmullw      mm6, mm5              ;
288        mm5 *= kernel 4 modifiers
289        paddusw     mm3, mm6              ;
290        mm3 += mm5
291
292        ;
293        thresholding
294        movq        mm6, mm1              ;
295        mm6 = p0..p3
296        psubusw     mm6, mm5              ;
297        mm6 = p0..p3 - p1..p4
298        psubusw     mm5, mm1              ;
299        mm5 = p1..p4 - p0..p3
300        paddusw     mm6, mm5              ;
301        mm6 = abs(p0..p3 - p1..p4)
302        pcmpgtw     mm6, mm2
303        por         mm7, mm6              ;
304        accumulate thresholds
305
306
307        movq        mm6, [ebx ]
308        movq        mm4, [edi+edx-2]      ;
309        mm4 = p-2..p5
310        movq        mm5, mm4              ;
311        mm5 = p-2..p5
312        punpcklbw   mm5, mm0              ;
313        mm5 = p-2..p1
314        pmullw      mm6, mm5              ;
315        mm5 *= kernel 0 modifiers
316        paddusw     mm3, mm6              ;
317        mm3 += mm5
318
319        ;
320        thresholding
321        movq        mm6, mm1              ;
322        mm6 = p0..p3
323        psubusw     mm6, mm5              ;
324        mm6 = p0..p3 - p1..p4
325        psubusw     mm5, mm1              ;
326        mm5 = p1..p4 - p0..p3
327        paddusw     mm6, mm5              ;
328        mm6 = abs(p0..p3 - p1..p4)
329        pcmpgtw     mm6, mm2
330        por         mm7, mm6              ;
331        accumulate thresholds
332
333        movq        mm6, [ebx + 16]
334        psrlq       mm4, 8                ;
335        mm4 = p-1..p5
336        punpcklbw   mm4, mm0              ;
337        mm4 = p-1..p2
338        pmullw      mm6, mm4              ;
339        mm4 *= kernel 1 modifiers.
340        paddusw     mm3, mm6              ;
341        mm3 += mm5
342
343        ;
344        thresholding
345        movq        mm6, mm1              ;
346        mm6 = p0..p3
347        psubusw     mm6, mm4              ;
348        mm6 = p0..p3 - p1..p4
349        psubusw     mm4, mm1              ;
350        mm5 = p1..p4 - p0..p3
351        paddusw     mm6, mm4              ;
352        mm6 = abs(p0..p3 - p1..p4)
353        pcmpgtw     mm6, mm2
354        por         mm7, mm6              ;
355        accumulate thresholds
356
357        paddusw     mm3, rd               ;
358        mm3 += round value
359        psraw       mm3, VP8_FILTER_SHIFT     ;
360        mm3 /= 128
361
362        pand        mm1, mm7              ;
363        mm1 select vals > thresh from source
364        pandn       mm7, mm3              ;
365        mm7 select vals < thresh from blurred result
366        paddusw     mm1, mm7              ;
367        combination
368
369        packuswb    mm1, mm0              ;
370        pack to bytes
371        mov         DWORD PTR [edi+edx-4],  eax   ;
372        store previous four bytes
373        movd        eax,    mm1
374
375        add         edx, 4
376        cmp         edx, cols
377        jl          acrossnextcol;
378
379        mov         DWORD PTR [edi+edx-4],  eax
380        pop         eax
381
382        // done with this rwo
383        add         esi, eax               ;
384        next line
385        mov         eax, dst_pixels_per_line ;
386        destination pitch?
387        add         edi, eax               ;
388        next destination
389        mov         eax, src_pixels_per_line ;
390        destination pitch?
391
392        dec         ecx                   ;
393        decrement count
394        jnz         nextrow               ;
395        next row
396        pop         ebx
397
398    }
399}
400
401
402
403void vp8_post_proc_down_and_across_xmm
404(
405    unsigned char *src_ptr,
406    unsigned char *dst_ptr,
407    int src_pixels_per_line,
408    int dst_pixels_per_line,
409    int rows,
410    int cols,
411    int flimit
412)
413{
414#ifdef RELOCATEABLE
415    R4D2
416#endif
417
418    __asm
419    {
420        movd        xmm2,       flimit
421        punpcklwd   xmm2,       xmm2
422        punpckldq   xmm2,       xmm2
423        punpcklqdq  xmm2,       xmm2
424
425        mov         esi,        src_ptr
426        mov         edi,        dst_ptr
427
428        mov         ecx,        DWORD PTR rows
429        mov         eax,        src_pixels_per_line ;
430        destination pitch?
431        pxor        xmm0,       xmm0              ;
432        mm0 = 00000000
433
434        nextrow:
435
436        xor         edx,        edx       ;
437
438        clear out edx for use as loop counter
439        nextcol:
440        movq        xmm3,       QWORD PTR [esi]         ;
441
442        mm4 = r0 p0..p7
443        punpcklbw   xmm3,       xmm0                    ;
444        mm3 = p0..p3
445        movdqa      xmm1,       xmm3                    ;
446        mm1 = p0..p3
447        psllw       xmm3,       2                       ;
448
449        movq        xmm5,       QWORD PTR [esi + eax]   ;
450        mm4 = r1 p0..p7
451        punpcklbw   xmm5,       xmm0                    ;
452        mm5 = r1 p0..p3
453        paddusw     xmm3,       xmm5                    ;
454        mm3 += mm6
455
456        ;
457        thresholding
458        movdqa      xmm7,       xmm1                    ;
459        mm7 = r0 p0..p3
460        psubusw     xmm7,       xmm5                    ;
461        mm7 = r0 p0..p3 - r1 p0..p3
462        psubusw     xmm5,       xmm1                    ;
463        mm5 = r1 p0..p3 - r0 p0..p3
464        paddusw     xmm7,       xmm5                    ;
465        mm7 = abs(r0 p0..p3 - r1 p0..p3)
466        pcmpgtw     xmm7,       xmm2
467
468        movq        xmm5,       QWORD PTR [esi + 2*eax] ;
469        mm4 = r2 p0..p7
470        punpcklbw   xmm5,       xmm0                    ;
471        mm5 = r2 p0..p3
472        paddusw     xmm3,       xmm5                    ;
473        mm3 += mm5
474
475        ;
476        thresholding
477        movdqa      xmm6,       xmm1                    ;
478        mm6 = r0 p0..p3
479        psubusw     xmm6,       xmm5                    ;
480        mm6 = r0 p0..p3 - r2 p0..p3
481        psubusw     xmm5,       xmm1                    ;
482        mm5 = r2 p0..p3 - r2 p0..p3
483        paddusw     xmm6,       xmm5                    ;
484        mm6 = abs(r0 p0..p3 - r2 p0..p3)
485        pcmpgtw     xmm6,       xmm2
486        por         xmm7,       xmm6                    ;
487        accumulate thresholds
488
489
490        neg         eax
491        movq        xmm5,       QWORD PTR [esi+2*eax]   ;
492        mm4 = r-2 p0..p7
493        punpcklbw   xmm5,       xmm0                    ;
494        mm5 = r-2 p0..p3
495        paddusw     xmm3,       xmm5                    ;
496        mm3 += mm5
497
498        ;
499        thresholding
500        movdqa      xmm6,       xmm1                    ;
501        mm6 = r0 p0..p3
502        psubusw     xmm6,       xmm5                    ;
503        mm6 = p0..p3 - r-2 p0..p3
504        psubusw     xmm5,       xmm1                    ;
505        mm5 = r-2 p0..p3 - p0..p3
506        paddusw     xmm6,       xmm5                    ;
507        mm6 = abs(r0 p0..p3 - r-2 p0..p3)
508        pcmpgtw     xmm6,       xmm2
509        por         xmm7,       xmm6                    ;
510        accumulate thresholds
511
512        movq        xmm4,       QWORD PTR [esi+eax]     ;
513        mm4 = r-1 p0..p7
514        punpcklbw   xmm4,       xmm0                    ;
515        mm4 = r-1 p0..p3
516        paddusw     xmm3,       xmm4                    ;
517        mm3 += mm5
518
519        ;
520        thresholding
521        movdqa      xmm6,       xmm1                    ;
522        mm6 = r0 p0..p3
523        psubusw     xmm6,       xmm4                    ;
524        mm6 = p0..p3 - r-2 p0..p3
525        psubusw     xmm4,       xmm1                    ;
526        mm5 = r-1 p0..p3 - p0..p3
527        paddusw     xmm6,       xmm4                    ;
528        mm6 = abs(r0 p0..p3 - r-1 p0..p3)
529        pcmpgtw     xmm6,       xmm2
530        por         xmm7,       xmm6                    ;
531        accumulate thresholds
532
533
534        paddusw     xmm3,       rd42                    ;
535        mm3 += round value
536        psraw       xmm3,       3                       ;
537        mm3 /= 8
538
539        pand        xmm1,       xmm7                    ;
540        mm1 select vals > thresh from source
541        pandn       xmm7,       xmm3                    ;
542        mm7 select vals < thresh from blurred result
543        paddusw     xmm1,       xmm7                    ;
544        combination
545
546        packuswb    xmm1,       xmm0                    ;
547        pack to bytes
548        movq        QWORD PTR [edi], xmm1             ;
549
550        neg         eax                   ;
551        pitch is positive
552        add         esi,        8
553        add         edi,        8
554
555        add         edx,        8
556        cmp         edx,        cols
557
558        jl          nextcol
559
560        // done with the all cols, start the across filtering in place
561        sub         esi,        edx
562        sub         edi,        edx
563
564        xor         edx,        edx
565        movq        mm0,        QWORD PTR [edi-8];
566
567        acrossnextcol:
568        movq        xmm7,       QWORD PTR [edi +edx -2]
569        movd        xmm4,       DWORD PTR [edi +edx +6]
570
571        pslldq      xmm4,       8
572        por         xmm4,       xmm7
573
574        movdqa      xmm3,       xmm4
575        psrldq      xmm3,       2
576        punpcklbw   xmm3,       xmm0              ;
577        mm3 = p0..p3
578        movdqa      xmm1,       xmm3              ;
579        mm1 = p0..p3
580        psllw       xmm3,       2
581
582
583        movdqa      xmm5,       xmm4
584        psrldq      xmm5,       3
585        punpcklbw   xmm5,       xmm0              ;
586        mm5 = p1..p4
587        paddusw     xmm3,       xmm5              ;
588        mm3 += mm6
589
590        ;
591        thresholding
592        movdqa      xmm7,       xmm1              ;
593        mm7 = p0..p3
594        psubusw     xmm7,       xmm5              ;
595        mm7 = p0..p3 - p1..p4
596        psubusw     xmm5,       xmm1              ;
597        mm5 = p1..p4 - p0..p3
598        paddusw     xmm7,       xmm5              ;
599        mm7 = abs(p0..p3 - p1..p4)
600        pcmpgtw     xmm7,       xmm2
601
602        movdqa      xmm5,       xmm4
603        psrldq      xmm5,       4
604        punpcklbw   xmm5,       xmm0              ;
605        mm5 = p2..p5
606        paddusw     xmm3,       xmm5              ;
607        mm3 += mm5
608
609        ;
610        thresholding
611        movdqa      xmm6,       xmm1              ;
612        mm6 = p0..p3
613        psubusw     xmm6,       xmm5              ;
614        mm6 = p0..p3 - p1..p4
615        psubusw     xmm5,       xmm1              ;
616        mm5 = p1..p4 - p0..p3
617        paddusw     xmm6,       xmm5              ;
618        mm6 = abs(p0..p3 - p1..p4)
619        pcmpgtw     xmm6,       xmm2
620        por         xmm7,       xmm6              ;
621        accumulate thresholds
622
623
624        movdqa      xmm5,       xmm4              ;
625        mm5 = p-2..p5
626        punpcklbw   xmm5,       xmm0              ;
627        mm5 = p-2..p1
628        paddusw     xmm3,       xmm5              ;
629        mm3 += mm5
630
631        ;
632        thresholding
633        movdqa      xmm6,       xmm1              ;
634        mm6 = p0..p3
635        psubusw     xmm6,       xmm5              ;
636        mm6 = p0..p3 - p1..p4
637        psubusw     xmm5,       xmm1              ;
638        mm5 = p1..p4 - p0..p3
639        paddusw     xmm6,       xmm5              ;
640        mm6 = abs(p0..p3 - p1..p4)
641        pcmpgtw     xmm6,       xmm2
642        por         xmm7,       xmm6              ;
643        accumulate thresholds
644
645        psrldq      xmm4,       1                   ;
646        mm4 = p-1..p5
647        punpcklbw   xmm4,       xmm0              ;
648        mm4 = p-1..p2
649        paddusw     xmm3,       xmm4              ;
650        mm3 += mm5
651
652        ;
653        thresholding
654        movdqa      xmm6,       xmm1              ;
655        mm6 = p0..p3
656        psubusw     xmm6,       xmm4              ;
657        mm6 = p0..p3 - p1..p4
658        psubusw     xmm4,       xmm1              ;
659        mm5 = p1..p4 - p0..p3
660        paddusw     xmm6,       xmm4              ;
661        mm6 = abs(p0..p3 - p1..p4)
662        pcmpgtw     xmm6,       xmm2
663        por         xmm7,       xmm6              ;
664        accumulate thresholds
665
666        paddusw     xmm3,       rd42              ;
667        mm3 += round value
668        psraw       xmm3,       3                 ;
669        mm3 /= 8
670
671        pand        xmm1,       xmm7              ;
672        mm1 select vals > thresh from source
673        pandn       xmm7,       xmm3              ;
674        mm7 select vals < thresh from blurred result
675        paddusw     xmm1,       xmm7              ;
676        combination
677
678        packuswb    xmm1,       xmm0              ;
679        pack to bytes
680        movq        QWORD PTR [edi+edx-8],  mm0   ;
681        store previous four bytes
682        movdq2q     mm0,        xmm1
683
684        add         edx,        8
685        cmp         edx,        cols
686        jl          acrossnextcol;
687
688        // last 8 pixels
689        movq        QWORD PTR [edi+edx-8],  mm0
690
691        // done with this rwo
692        add         esi, eax               ;
693        next line
694        mov         eax, dst_pixels_per_line ;
695        destination pitch?
696        add         edi, eax               ;
697        next destination
698        mov         eax, src_pixels_per_line ;
699        destination pitch?
700
701        dec         ecx                   ;
702        decrement count
703        jnz         nextrow               ;
704        next row
705    }
706}
707
708
709void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
710{
711    int c, i;
712    __declspec(align(16))
713    int flimit2[2];
714    __declspec(align(16))
715    unsigned char d[16][8];
716
717    flimit = vp8_q2mbl(flimit);
718
719    for (i = 0; i < 2; i++)
720        flimit2[i] = flimit;
721
722    rows += 8;
723
724    for (c = 0; c < cols; c += 4)
725    {
726        unsigned char *s = &dst[c];
727
728        __asm
729        {
730            mov         esi,        s           ;
731            pxor        mm0,        mm0     ;
732
733            mov         eax,        pitch       ;
734            neg         eax                                     // eax = -pitch
735
736            lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
737            neg         eax
738
739
740            pxor        mm5,        mm5
741            pxor        mm6,        mm6     ;
742
743            pxor        mm7,        mm7     ;
744            mov         edi,        esi
745
746            mov         ecx,        15          ;
747
748            loop_initvar:
749            movd        mm1,        DWORD PTR [edi];
750            punpcklbw   mm1,        mm0     ;
751
752            paddw       mm5,        mm1     ;
753            pmullw      mm1,        mm1     ;
754
755            movq        mm2,        mm1     ;
756            punpcklwd   mm1,        mm0     ;
757
758            punpckhwd   mm2,        mm0     ;
759            paddd       mm6,        mm1     ;
760
761            paddd       mm7,        mm2     ;
762            lea         edi,        [edi+eax]   ;
763
764            dec         ecx
765            jne         loop_initvar
766            //save the var and sum
767            xor         edx,        edx
768            loop_row:
769            movd        mm1,        DWORD PTR [esi]     // [s-pitch*8]
770            movd        mm2,        DWORD PTR [edi]     // [s+pitch*7]
771
772            punpcklbw   mm1,        mm0
773            punpcklbw   mm2,        mm0
774
775            paddw       mm5,        mm2
776            psubw       mm5,        mm1
777
778            pmullw      mm2,        mm2
779            movq        mm4,        mm2
780
781            punpcklwd   mm2,        mm0
782            punpckhwd   mm4,        mm0
783
784            paddd       mm6,        mm2
785            paddd       mm7,        mm4
786
787            pmullw      mm1,        mm1
788            movq        mm2,        mm1
789
790            punpcklwd   mm1,        mm0
791            psubd       mm6,        mm1
792
793            punpckhwd   mm2,        mm0
794            psubd       mm7,        mm2
795
796
797            movq        mm3,        mm6
798            pslld       mm3,        4
799
800            psubd       mm3,        mm6
801            movq        mm1,        mm5
802
803            movq        mm4,        mm5
804            pmullw      mm1,        mm1
805
806            pmulhw      mm4,        mm4
807            movq        mm2,        mm1
808
809            punpcklwd   mm1,        mm4
810            punpckhwd   mm2,        mm4
811
812            movq        mm4,        mm7
813            pslld       mm4,        4
814
815            psubd       mm4,        mm7
816
817            psubd       mm3,        mm1
818            psubd       mm4,        mm2
819
820            psubd       mm3,        flimit2
821            psubd       mm4,        flimit2
822
823            psrad       mm3,        31
824            psrad       mm4,        31
825
826            packssdw    mm3,        mm4
827            packsswb    mm3,        mm0
828
829            movd        mm1,        DWORD PTR [esi+eax*8]
830
831            movq        mm2,        mm1
832            punpcklbw   mm1,        mm0
833
834            paddw       mm1,        mm5
835            mov         ecx,        edx
836
837            and         ecx,        127
838            movq        mm4,        vp8_rv[ecx*2]
839
840            paddw       mm1,        mm4
841            //paddw     xmm1,       eight8s
842            psraw       mm1,        4
843
844            packuswb    mm1,        mm0
845            pand        mm1,        mm3
846
847            pandn       mm3,        mm2
848            por         mm1,        mm3
849
850            and         ecx,        15
851            movd        DWORD PTR  d[ecx*4], mm1
852
853            mov         ecx,        edx
854            sub         ecx,        8
855
856            and         ecx,        15
857            movd        mm1,        DWORD PTR d[ecx*4]
858
859            movd        [esi],      mm1
860            lea         esi,        [esi+eax]
861
862            lea         edi,        [edi+eax]
863            add         edx,        1
864
865            cmp         edx,        rows
866            jl          loop_row
867
868        }
869
870    }
871}
872
873void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
874{
875    int c, i;
876    __declspec(align(16))
877    int flimit4[4];
878    __declspec(align(16))
879    unsigned char d[16][8];
880
881    flimit = vp8_q2mbl(flimit);
882
883    for (i = 0; i < 4; i++)
884        flimit4[i] = flimit;
885
886    rows += 8;
887
888    for (c = 0; c < cols; c += 8)
889    {
890        unsigned char *s = &dst[c];
891
892        __asm
893        {
894            mov         esi,        s           ;
895            pxor        xmm0,       xmm0        ;
896
897            mov         eax,        pitch       ;
898            neg         eax                                     // eax = -pitch
899
900            lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
901            neg         eax
902
903
904            pxor        xmm5,       xmm5
905            pxor        xmm6,       xmm6        ;
906
907            pxor        xmm7,       xmm7        ;
908            mov         edi,        esi
909
910            mov         ecx,        15          ;
911
912            loop_initvar:
913            movq        xmm1,       QWORD PTR [edi];
914            punpcklbw   xmm1,       xmm0        ;
915
916            paddw       xmm5,       xmm1        ;
917            pmullw      xmm1,       xmm1        ;
918
919            movdqa      xmm2,       xmm1        ;
920            punpcklwd   xmm1,       xmm0        ;
921
922            punpckhwd   xmm2,       xmm0        ;
923            paddd       xmm6,       xmm1        ;
924
925            paddd       xmm7,       xmm2        ;
926            lea         edi,        [edi+eax]   ;
927
928            dec         ecx
929            jne         loop_initvar
930            //save the var and sum
931            xor         edx,        edx
932            loop_row:
933            movq        xmm1,       QWORD PTR [esi]     // [s-pitch*8]
934            movq        xmm2,       QWORD PTR [edi]     // [s+pitch*7]
935
936            punpcklbw   xmm1,       xmm0
937            punpcklbw   xmm2,       xmm0
938
939            paddw       xmm5,       xmm2
940            psubw       xmm5,       xmm1
941
942            pmullw      xmm2,       xmm2
943            movdqa      xmm4,       xmm2
944
945            punpcklwd   xmm2,       xmm0
946            punpckhwd   xmm4,       xmm0
947
948            paddd       xmm6,       xmm2
949            paddd       xmm7,       xmm4
950
951            pmullw      xmm1,       xmm1
952            movdqa      xmm2,       xmm1
953
954            punpcklwd   xmm1,       xmm0
955            psubd       xmm6,       xmm1
956
957            punpckhwd   xmm2,       xmm0
958            psubd       xmm7,       xmm2
959
960
961            movdqa      xmm3,       xmm6
962            pslld       xmm3,       4
963
964            psubd       xmm3,       xmm6
965            movdqa      xmm1,       xmm5
966
967            movdqa      xmm4,       xmm5
968            pmullw      xmm1,       xmm1
969
970            pmulhw      xmm4,       xmm4
971            movdqa      xmm2,       xmm1
972
973            punpcklwd   xmm1,       xmm4
974            punpckhwd   xmm2,       xmm4
975
976            movdqa      xmm4,       xmm7
977            pslld       xmm4,       4
978
979            psubd       xmm4,       xmm7
980
981            psubd       xmm3,       xmm1
982            psubd       xmm4,       xmm2
983
984            psubd       xmm3,       flimit4
985            psubd       xmm4,       flimit4
986
987            psrad       xmm3,       31
988            psrad       xmm4,       31
989
990            packssdw    xmm3,       xmm4
991            packsswb    xmm3,       xmm0
992
993            movq        xmm1,       QWORD PTR [esi+eax*8]
994
995            movq        xmm2,       xmm1
996            punpcklbw   xmm1,       xmm0
997
998            paddw       xmm1,       xmm5
999            mov         ecx,        edx
1000
1001            and         ecx,        127
1002            movdqu      xmm4,       vp8_rv[ecx*2]
1003
1004            paddw       xmm1,       xmm4
1005            //paddw     xmm1,       eight8s
1006            psraw       xmm1,       4
1007
1008            packuswb    xmm1,       xmm0
1009            pand        xmm1,       xmm3
1010
1011            pandn       xmm3,       xmm2
1012            por         xmm1,       xmm3
1013
1014            and         ecx,        15
1015            movq        QWORD PTR  d[ecx*8], xmm1
1016
1017            mov         ecx,        edx
1018            sub         ecx,        8
1019
1020            and         ecx,        15
1021            movq        mm0,        d[ecx*8]
1022
1023            movq        [esi],      mm0
1024            lea         esi,        [esi+eax]
1025
1026            lea         edi,        [edi+eax]
1027            add         edx,        1
1028
1029            cmp         edx,        rows
1030            jl          loop_row
1031
1032        }
1033
1034    }
1035}
1036#if 0
1037/****************************************************************************
1038 *
1039 *  ROUTINE       : plane_add_noise_wmt
1040 *
1041 *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
1042 *                                  noise to
1043 *                  unsigned int Width    width of plane
1044 *                  unsigned int Height   height of plane
1045 *                  int  Pitch    distance between subsequent lines of frame
1046 *                  int  q        quantizer used to determine amount of noise
1047 *                                  to add
1048 *
1049 *  OUTPUTS       : None.
1050 *
1051 *  RETURNS       : void.
1052 *
1053 *  FUNCTION      : adds gaussian noise to a plane of pixels
1054 *
1055 *  SPECIAL NOTES : None.
1056 *
1057 ****************************************************************************/
1058void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1059{
1060    unsigned int i;
1061
1062    __declspec(align(16)) unsigned char blackclamp[16];
1063    __declspec(align(16)) unsigned char whiteclamp[16];
1064    __declspec(align(16)) unsigned char bothclamp[16];
1065    char char_dist[300];
1066    char Rand[2048];
1067    double sigma;
1068//    return;
1069    __asm emms
1070    sigma = a + .5 + .6 * (63 - q) / 63.0;
1071
1072    // set up a lookup table of 256 entries that matches
1073    // a gaussian distribution with sigma determined by q.
1074    //
1075    {
1076        double i;
1077        int next, j;
1078
1079        next = 0;
1080
1081        for (i = -32; i < 32; i++)
1082        {
1083            double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
1084            int a = (int)(g + .5);
1085
1086            if (a)
1087            {
1088                for (j = 0; j < a; j++)
1089                {
1090                    char_dist[next+j] = (char) i;
1091                }
1092
1093                next = next + j;
1094            }
1095
1096        }
1097
1098        for (next = next; next < 256; next++)
1099            char_dist[next] = 0;
1100
1101    }
1102
1103    for (i = 0; i < 2048; i++)
1104    {
1105        Rand[i] = char_dist[rand() & 0xff];
1106    }
1107
1108    for (i = 0; i < 16; i++)
1109    {
1110        blackclamp[i] = -char_dist[0];
1111        whiteclamp[i] = -char_dist[0];
1112        bothclamp[i] = -2 * char_dist[0];
1113    }
1114
1115    for (i = 0; i < Height; i++)
1116    {
1117        unsigned char *Pos = Start + i * Pitch;
1118        char  *Ref = Rand + (rand() & 0xff);
1119
1120        __asm
1121        {
1122            mov ecx, [Width]
1123            mov esi, Pos
1124            mov edi, Ref
1125            xor         eax, eax
1126
1127            nextset:
1128            movdqu      xmm1, [esi+eax]        // get the source
1129
1130            psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
1131            paddusb     xmm1, bothclamp
1132            psubusb     xmm1, whiteclamp
1133
1134            movdqu      xmm2, [edi+eax]        // get the noise for this line
1135            paddb       xmm1, xmm2             // add it in
1136            movdqu      [esi+eax], xmm1        // store the result
1137
1138            add         eax, 16                // move to the next line
1139
1140            cmp         eax, ecx
1141            jl          nextset
1142
1143
1144        }
1145
1146    }
1147}
1148#endif
1149__declspec(align(16))
1150static const int four8s[4] = { 8, 8, 8, 8};
1151void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
1152{
1153    int r, i;
1154    __declspec(align(16))
1155    int flimit4[4];
1156    unsigned char *s = src;
1157    int sumsq;
1158    int sum;
1159
1160
1161    flimit = vp8_q2mbl(flimit);
1162    flimit4[0] =
1163        flimit4[1] =
1164            flimit4[2] =
1165                flimit4[3] = flimit;
1166
1167    for (r = 0; r < rows; r++)
1168    {
1169
1170
1171        sumsq = 0;
1172        sum = 0;
1173
1174        for (i = -8; i <= 6; i++)
1175        {
1176            sumsq += s[i] * s[i];
1177            sum   += s[i];
1178        }
1179
1180        __asm
1181        {
1182            mov         eax,    sumsq
1183            movd        xmm7,   eax
1184
1185            mov         eax,    sum
1186            movd        xmm6,   eax
1187
1188            mov         esi,    s
1189            xor         ecx,    ecx
1190
1191            mov         edx,    cols
1192            add         edx,    8
1193            pxor        mm0,    mm0
1194            pxor        mm1,    mm1
1195
1196            pxor        xmm0,   xmm0
1197            nextcol4:
1198
1199            movd        xmm1,   DWORD PTR [esi+ecx-8]   // -8 -7 -6 -5
1200            movd        xmm2,   DWORD PTR [esi+ecx+7]   // +7 +8 +9 +10
1201
1202            punpcklbw   xmm1,   xmm0                    // expanding
1203            punpcklbw   xmm2,   xmm0                    // expanding
1204
1205            punpcklwd   xmm1,   xmm0                    // expanding to dwords
1206            punpcklwd   xmm2,   xmm0                    // expanding to dwords
1207
1208            psubd       xmm2,   xmm1                    // 7--8   8--7   9--6 10--5
1209            paddd       xmm1,   xmm1                    // -8*2   -7*2   -6*2 -5*2
1210
1211            paddd       xmm1,   xmm2                    // 7+-8   8+-7   9+-6 10+-5
1212            pmaddwd     xmm1,   xmm2                    // squared of 7+-8   8+-7   9+-6 10+-5
1213
1214            paddd       xmm6,   xmm2
1215            paddd       xmm7,   xmm1
1216
1217            pshufd      xmm6,   xmm6,   0               // duplicate the last ones
1218            pshufd      xmm7,   xmm7,   0               // duplicate the last ones
1219
1220            psrldq      xmm1,       4                   // 8--7   9--6 10--5  0000
1221            psrldq      xmm2,       4                   // 8--7   9--6 10--5  0000
1222
1223            pshufd      xmm3,   xmm1,   3               // 0000  8--7   8--7   8--7 squared
1224            pshufd      xmm4,   xmm2,   3               // 0000  8--7   8--7   8--7 squared
1225
1226            paddd       xmm6,   xmm4
1227            paddd       xmm7,   xmm3
1228
1229            pshufd      xmm3,   xmm1,   01011111b       // 0000  0000   9--6   9--6 squared
1230            pshufd      xmm4,   xmm2,   01011111b       // 0000  0000   9--6   9--6 squared
1231
1232            paddd       xmm7,   xmm3
1233            paddd       xmm6,   xmm4
1234
1235            pshufd      xmm3,   xmm1,   10111111b       // 0000  0000   8--7   8--7 squared
1236            pshufd      xmm4,   xmm2,   10111111b       // 0000  0000   8--7   8--7 squared
1237
1238            paddd       xmm7,   xmm3
1239            paddd       xmm6,   xmm4
1240
1241            movdqa      xmm3,   xmm6
1242            pmaddwd     xmm3,   xmm3
1243
1244            movdqa      xmm5,   xmm7
1245            pslld       xmm5,   4
1246
1247            psubd       xmm5,   xmm7
1248            psubd       xmm5,   xmm3
1249
1250            psubd       xmm5,   flimit4
1251            psrad       xmm5,   31
1252
1253            packssdw    xmm5,   xmm0
1254            packsswb    xmm5,   xmm0
1255
1256            movd        xmm1,   DWORD PTR [esi+ecx]
1257            movq        xmm2,   xmm1
1258
1259            punpcklbw   xmm1,   xmm0
1260            punpcklwd   xmm1,   xmm0
1261
1262            paddd       xmm1,   xmm6
1263            paddd       xmm1,   four8s
1264
1265            psrad       xmm1,   4
1266            packssdw    xmm1,   xmm0
1267
1268            packuswb    xmm1,   xmm0
1269            pand        xmm1,   xmm5
1270
1271            pandn       xmm5,   xmm2
1272            por         xmm5,   xmm1
1273
1274            movd        [esi+ecx-8],  mm0
1275            movq        mm0,    mm1
1276
1277            movdq2q     mm1,    xmm5
1278            psrldq      xmm7,   12
1279
1280            psrldq      xmm6,   12
1281            add         ecx,    4
1282
1283            cmp         ecx,    edx
1284            jl          nextcol4
1285
1286        }
1287        s += pitch;
1288    }
1289}
1290
1291#if 0
1292
1293/****************************************************************************
1294 *
1295 *  ROUTINE       : plane_add_noise_mmx
1296 *
1297 *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
1298 *                                  noise to
1299 *                  unsigned int Width    width of plane
1300 *                  unsigned int Height   height of plane
1301 *                  int  Pitch    distance between subsequent lines of frame
1302 *                  int  q        quantizer used to determine amount of noise
1303 *                                  to add
1304 *
1305 *  OUTPUTS       : None.
1306 *
1307 *  RETURNS       : void.
1308 *
1309 *  FUNCTION      : adds gaussian noise to a plane of pixels
1310 *
1311 *  SPECIAL NOTES : None.
1312 *
1313 ****************************************************************************/
1314void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1315{
1316    unsigned int i;
1317    int Pitch4 = Pitch * 4;
1318    const int noise_amount = 2;
1319    const int noise_adder = 2 * noise_amount + 1;
1320
1321    __declspec(align(16)) unsigned char blackclamp[16];
1322    __declspec(align(16)) unsigned char whiteclamp[16];
1323    __declspec(align(16)) unsigned char bothclamp[16];
1324
1325    char char_dist[300];
1326    char Rand[2048];
1327
1328    double sigma;
1329    __asm emms
1330    sigma = a + .5 + .6 * (63 - q) / 63.0;
1331
1332    // set up a lookup table of 256 entries that matches
1333    // a gaussian distribution with sigma determined by q.
1334    //
1335    {
1336        double i, sum = 0;
1337        int next, j;
1338
1339        next = 0;
1340
1341        for (i = -32; i < 32; i++)
1342        {
1343            int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
1344
1345            if (a)
1346            {
1347                for (j = 0; j < a; j++)
1348                {
1349                    char_dist[next+j] = (char) i;
1350                }
1351
1352                next = next + j;
1353            }
1354
1355        }
1356
1357        for (next = next; next < 256; next++)
1358            char_dist[next] = 0;
1359
1360    }
1361
1362    for (i = 0; i < 2048; i++)
1363    {
1364        Rand[i] = char_dist[rand() & 0xff];
1365    }
1366
1367    for (i = 0; i < 16; i++)
1368    {
1369        blackclamp[i] = -char_dist[0];
1370        whiteclamp[i] = -char_dist[0];
1371        bothclamp[i] = -2 * char_dist[0];
1372    }
1373
1374    for (i = 0; i < Height; i++)
1375    {
1376        unsigned char *Pos = Start + i * Pitch;
1377        char  *Ref = Rand + (rand() & 0xff);
1378
1379        __asm
1380        {
1381            mov ecx, [Width]
1382            mov esi, Pos
1383            mov edi, Ref
1384            xor         eax, eax
1385
1386            nextset:
1387            movq        mm1, [esi+eax]        // get the source
1388
1389            psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
1390            paddusb     mm1, bothclamp
1391            psubusb     mm1, whiteclamp
1392
1393            movq        mm2, [edi+eax]        // get the noise for this line
1394            paddb       mm1, mm2             // add it in
1395            movq        [esi+eax], mm1        // store the result
1396
1397            add         eax, 8                // move to the next line
1398
1399            cmp         eax, ecx
1400            jl          nextset
1401
1402
1403        }
1404
1405    }
1406}
1407#else
1408extern char an[8][64][3072];
1409extern int cd[8][64];
1410
1411void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1412{
1413    unsigned int i;
1414    __declspec(align(16)) unsigned char blackclamp[16];
1415    __declspec(align(16)) unsigned char whiteclamp[16];
1416    __declspec(align(16)) unsigned char bothclamp[16];
1417
1418
1419    __asm emms
1420
1421    for (i = 0; i < 16; i++)
1422    {
1423        blackclamp[i] = -cd[a][q];
1424        whiteclamp[i] = -cd[a][q];
1425        bothclamp[i] = -2 * cd[a][q];
1426    }
1427
1428    for (i = 0; i < Height; i++)
1429    {
1430        unsigned char *Pos = Start + i * Pitch;
1431        char  *Ref = an[a][q] + (rand() & 0xff);
1432
1433        __asm
1434        {
1435            mov ecx, [Width]
1436            mov esi, Pos
1437            mov edi, Ref
1438            xor         eax, eax
1439
1440            nextset:
1441            movq        mm1, [esi+eax]        // get the source
1442
1443            psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
1444            paddusb     mm1, bothclamp
1445            psubusb     mm1, whiteclamp
1446
1447            movq        mm2, [edi+eax]        // get the noise for this line
1448            paddb       mm1, mm2             // add it in
1449            movq        [esi+eax], mm1        // store the result
1450
1451            add         eax, 8                // move to the next line
1452
1453            cmp         eax, ecx
1454            jl          nextset
1455        }
1456    }
1457}
1458
1459
1460void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1461{
1462    unsigned int i;
1463
1464    __declspec(align(16)) unsigned char blackclamp[16];
1465    __declspec(align(16)) unsigned char whiteclamp[16];
1466    __declspec(align(16)) unsigned char bothclamp[16];
1467
1468    __asm emms
1469
1470    for (i = 0; i < 16; i++)
1471    {
1472        blackclamp[i] = -cd[a][q];
1473        whiteclamp[i] = -cd[a][q];
1474        bothclamp[i] = -2 * cd[a][q];
1475    }
1476
1477    for (i = 0; i < Height; i++)
1478    {
1479        unsigned char *Pos = Start + i * Pitch;
1480        char *Ref = an[a][q] + (rand() & 0xff);
1481
1482        __asm
1483        {
1484            mov ecx,    [Width]
1485            mov esi,    Pos
1486            mov edi,    Ref
1487            xor         eax, eax
1488
1489            nextset:
1490            movdqu      xmm1, [esi+eax]        // get the source
1491
1492            psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
1493            paddusb     xmm1, bothclamp
1494            psubusb     xmm1, whiteclamp
1495
1496            movdqu      xmm2, [edi+eax]        // get the noise for this line
1497            paddb       xmm1, xmm2             // add it in
1498            movdqu      [esi+eax], xmm1        // store the result
1499
1500            add         eax, 16                // move to the next line
1501
1502            cmp         eax, ecx
1503            jl          nextset
1504        }
1505    }
1506}
1507
1508#endif
1509