1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12/****************************************************************************
13*
14*   Module Title :     scaleopt.cpp
15*
16*   Description  :     Optimized scaling functions
17*
18****************************************************************************/
19#include "pragmas.h"
20
21
22
23/****************************************************************************
24*  Module Statics
25****************************************************************************/
/*
 * Blend weights in units of 1/256.  Each table holds four 16-bit lanes so
 * that a single movq fills one MMX register.  51, 102, 154 and 205 are
 * 1/5, 2/5, 3/5 and 4/5 of 256 rounded to the nearest integer; the pairs
 * that are used together (51 + 205 and 102 + 154) add up to exactly 256.
 */
26__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
27__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
28__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
29__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
/* Added to each weighted sum before the ">> 8" so the shift rounds to nearest. */
30__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
/* Four lanes holding the value 1. */
31__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
/*
 * Per-lane weights for horizontal_line_4_5_scale_mmx: lane i of const45_1
 * weights source byte i, lane i of const45_2 weights source byte i + 1.
 */
32__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
33__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
/*
 * Byte mask that keeps only byte 6 of a quadword; the tail of
 * horizontal_line_4_5_scale_mmx uses it to replicate the last source byte.
 */
34__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
/*
 * Per-lane weights for horizontal_line_3_5_scale_mmx: const35_1 weights the
 * bytes s0 s1 s1 s2 and const35_2 weights the bytes s1 s2 s2 s3.
 */
35__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
36__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
37
38
39
40#include "vpx_scale/vpxscale.h"
41#include "vpx_mem/vpx_mem.h"
42
43/****************************************************************************
44 *
45 *  ROUTINE       : horizontal_line_3_5_scale_mmx
46 *
47 *  INPUTS        : const unsigned char *source : First byte of the input line.
48 *                  unsigned int source_width    : Number of input bytes.
49 *                  unsigned char *dest         : First byte of the output line.
50 *                  unsigned int dest_width      : Unused.
51 *
52 *  OUTPUTS       : None.
53 *
54 *  RETURNS       : void
55 *
56 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
57 *
58 *  SPECIAL NOTES : Every group of three input bytes s0 s1 s2 (s3 being the
 *                  first byte of the following group) yields five output
 *                  bytes:
 *                      d0 = s0
 *                      d1 = (102 * s0 + 154 * s1 + 128) >> 8
 *                      d2 = (205 * s1 +  51 * s2 + 128) >> 8
 *                      d3 = ( 51 * s1 + 205 * s2 + 128) >> 8
 *                      d4 = (154 * s2 + 102 * s3 + 128) >> 8
 *                  The final group has no successor, so s2 takes the place
 *                  of s3 there.
 *
 *                  Each group is fetched with one 4-byte load, so the final
 *                  group reads one byte past the line; that byte is masked
 *                  out and never reaches the output.  Exactly five bytes
 *                  are stored per group.
 *
 *                  The main loop is entered only when more than one group
 *                  is present, and the end-of-line test is an unsigned
 *                  compare because esi/edx hold addresses.
 *
 *                  NOTE(review): no emms is executed here; presumably the
 *                  caller restores the FPU state - confirm.
59 *
60 ****************************************************************************/
61static
62void horizontal_line_3_5_scale_mmx
63(
64    const unsigned char *source,
65    unsigned int source_width,
66    unsigned char *dest,
67    unsigned int dest_width
68)
69{
70    (void) dest_width;
71
72    __asm
73    {
74
75        push ebx
76
77        mov         esi,    source
78        mov         edi,    dest
79
80        mov         ecx,    source_width
        // edx = address of the final group (source + source_width - 3)
81        lea         edx,    [esi+ecx-3];
82
83        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
84        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
85
86        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
87        pxor        mm7,    mm7             // clear mm7
88
        // A line made of a single group has no "next" byte to blend with:
        // skip the main loop and let the final-group code handle it.
        cmp         esi,    edx
        jae         horiz_line_3_5_last

89        horiz_line_3_5_loop:
90
91        mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
92        mov        ebx,    eax
93
94        and         ebx,    0xffff00        // ebx = xx 01 02 xx
95        mov         ecx,    eax             // ecx = 00 01 02 03
96
97        and         eax,    0xffff0000      // eax = xx xx 02 03
98        xor         ecx,    eax             // ecx = 00 01 xx xx
99
100        shr         ebx,    8               // ebx = 01 02 xx xx
101        or          eax,    ebx             // eax = 01 02 02 03
102
103        shl         ebx,    16              // ebx = xx xx 01 02
104        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
105
106        or          ebx,    ecx             // ebx = 00 01 01 02
107        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
108
109        movd        mm0,    ebx             // mm0 = 00 01 01 02
110        pmullw      mm1,    mm6             // weight the right-hand neighbours
111
112        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
113        pmullw      mm0,    mm5             // weight the left-hand neighbours
114
        // Only byte 0 of this store survives; bytes 1..3 are overwritten by
        // the movd below.
115        mov         [edi],  ebx             // write output 00 xx xx xx
116        add         esi,    3
117
118        add         edi,    5
119        paddw       mm0,    mm1
120
121        paddw       mm0,    mm4
122        psrlw       mm0,    8
123
        // The flags set here are consumed by the jb below; the MMX
        // instructions in between do not modify EFLAGS.
124        cmp         esi,    edx
125        packuswb    mm0,    mm7
126
127        movd        DWORD Ptr [edi-4], mm0
        // Unsigned condition: esi and edx are addresses, not signed integers.
128        jb          horiz_line_3_5_loop
129
130//Exit:
        // Final group: same computation, but byte 02 is replicated in place
        // of the missing byte 03.
        horiz_line_3_5_last:
131        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
132        mov         ebx,    eax
133
134        and         ebx,    0xffff00        // ebx = xx 01 02 xx
135        mov         ecx,    eax             // ecx = 00 01 02 03
136
137        and         eax,    0xffff0000      // eax = xx xx 02 03
138        xor         ecx,    eax             // ecx = 00 01 xx xx
139
140        shr         ebx,    8               // ebx = 01 02 xx xx
141        or          eax,    ebx             // eax = 01 02 02 03
142
143        shl         eax,    8               // eax = xx 01 02 02
144        and         eax,    0xffff0000      // eax = xx xx 02 02
145
146        or          eax,    ebx             // eax = 01 02 02 02
147
148        shl         ebx,    16              // ebx = xx xx 01 02
149        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
150
151        or          ebx,    ecx             // ebx = 00 01 01 02
152        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
153
154        movd        mm0,    ebx             // mm0 = 00 01 01 02
155        pmullw      mm1,    mm6             // weight the right-hand neighbours
156
157        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
158        pmullw      mm0,    mm5             // weight the left-hand neighbours
159
160        mov         [edi],  ebx             // write output 00 xx xx xx
161        paddw       mm0,    mm1
162
163        paddw       mm0,    mm4
164        psrlw       mm0,    8
165
166        packuswb    mm0,    mm7
167        movd        DWORD Ptr [edi+1], mm0
168
169        pop ebx
170
171    }
172
173}
174
175
176/****************************************************************************
177 *
178 *  ROUTINE       : horizontal_line_4_5_scale_mmx
179 *
180 *  INPUTS        : const unsigned char *source : First byte of the input line.
181 *                  unsigned int source_width    : Number of input bytes.
182 *                  unsigned char *dest         : First byte of the output line.
183 *                  unsigned int dest_width      : Unused.
184 *
185 *  OUTPUTS       : None.
186 *
187 *  RETURNS       : void
188 *
189 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
190 *
191 *  SPECIAL NOTES : The line is processed eight input bytes (two 4:5 groups)
 *                  at a time.  With s0..s7 the current bytes and s8 the
 *                  first byte of the following chunk, ten output bytes are
 *                  produced:
 *                      d0 = s0
 *                      d1 = ( 51 * s0 + 205 * s1 + 128) >> 8
 *                      d2 = (102 * s1 + 154 * s2 + 128) >> 8
 *                      d3 = (154 * s2 + 102 * s3 + 128) >> 8
 *                      d4 = (205 * s3 +  51 * s4 + 128) >> 8
 *                      d5 = s4
 *                      d6 = ( 51 * s4 + 205 * s5 + 128) >> 8
 *                      d7 = (102 * s5 + 154 * s6 + 128) >> 8
 *                      d8 = (154 * s6 + 102 * s7 + 128) >> 8
 *                      d9 = (205 * s7 +  51 * s8 + 128) >> 8
 *                  The final chunk has no successor, so s7 takes the place
 *                  of s8 there.  Exactly ten bytes are stored per chunk.
 *
 *                  The main loop is entered only when more than one chunk
 *                  is present, and the end-of-line test is an unsigned
 *                  compare because esi/edx hold addresses.
 *
 *                  NOTE(review): no emms is executed here; presumably the
 *                  caller restores the FPU state - confirm.
192 *
193 ****************************************************************************/
194static
195void horizontal_line_4_5_scale_mmx
196(
197    const unsigned char *source,
198    unsigned int source_width,
199    unsigned char *dest,
200    unsigned int dest_width
201)
202{
203    (void)dest_width;
204
205    __asm
206    {
207
208        mov         esi,    source
209        mov         edi,    dest
210
211        mov         ecx,    source_width
        // edx = address of the final chunk (source + source_width - 8)
212        lea         edx,    [esi+ecx-8];
213
214        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
215        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
216
217        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
218        pxor        mm7,    mm7             // clear mm7
219
        // A line made of a single chunk has no "next" byte to blend with:
        // skip the main loop and let the final-chunk code handle it.
        cmp         esi,    edx
        jae         horiz_line_4_5_last

220        horiz_line_4_5_loop:
221
222        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
223        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
224
225        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
226        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
227
        // Only byte 0 of this store survives; bytes 1..3 are overwritten by
        // the store to [edi+1] below.
228        movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
229        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
230
231        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
232        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
233
234        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
235        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
236
        // Source byte 04 becomes output byte 5 unchanged; the other three
        // bytes of this store are overwritten by the final store.
237        movd        DWORD PTR [edi+5], mm2            // write output 05 xx xx xx
238        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
239
240        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
241        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
242
243        paddw       mm0,    mm1             // sum of both weighted neighbours
244        paddw       mm0,    mm4             // added round values
245
246        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
247        packuswb    mm0,    mm7
248
249        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
250        add         edi,    10
251
252        add         esi,    8
253        paddw       mm2,    mm3             // sum of both weighted neighbours
254
255        paddw       mm2,    mm4             // added round values
        // The flags set here are consumed by the jb below; the MMX
        // instructions in between do not modify EFLAGS.
256        cmp         esi,    edx
257
258        psrlw       mm2,    8
259        packuswb    mm2,    mm7
260
261        movd        DWORD PTR [edi-4], mm2 // write output 06 07 08 09
        // Unsigned condition: esi and edx are addresses, not signed integers.
262        jb         horiz_line_4_5_loop
263
264//Exit:
        // Final chunk: same computation, but byte 07 is replicated in place
        // of the missing byte 08.
        horiz_line_4_5_last:
265        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
266        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
267
268        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
269        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
270
271        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
272        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
273
274        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
275        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
276
277        movq        mm3,    mm1
278
279        movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
280        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
281
282        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
283        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
284
285        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
286        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
287
288        movd        DWORD PTR [edi+5], mm2  // write output 05 xx xx xx
289        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
290
291        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
292        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
293
294        paddw       mm0,    mm1             // sum of both weighted neighbours
295        paddw       mm0,    mm4             // added round values
296
297        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
298        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
299
300        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
301        paddw       mm2,    mm3             // sum of both weighted neighbours
302
303        paddw       mm2,    mm4             // added round values
304        psrlw       mm2,    8
305
306        packuswb    mm2,    mm7
307        movd        DWORD PTR [edi+6], mm2  // write output 06 07 08 09
308
309
310    }
311}
312
313/****************************************************************************
314 *
315 *  ROUTINE       : vertical_band_4_5_scale_mmx
316 *
317 *  INPUTS        : unsigned char *dest    : Row 0 of the band; scaled in place.
318 *                  unsigned int dest_pitch : Byte offset from one row to the next.
319 *                  unsigned int dest_width : Number of columns to process.
320 *
321 *  OUTPUTS       : None.
322 *
323 *  RETURNS       : void
324 *
325 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
326 *
327 *  SPECIAL NOTES : The routine uses the first line of the band below
328 *                  the current band. The function also has a "C" only
329 *                  version.
 *
 *                  With rN the contents of row N (dest + N * dest_pitch)
 *                  on entry, the rows are rewritten as
 *                      row 1 = ( 51 * r0 + 205 * r1 + 128) >> 8
 *                      row 2 = (102 * r1 + 154 * r2 + 128) >> 8
 *                      row 3 = (154 * r2 + 102 * r3 + 128) >> 8
 *                      row 4 = (205 * r3 +  51 * r5 + 128) >> 8
 *                  Rows 0 and 5 are only read; row 4 is only written.
 *
 *                  Eight columns are handled per iteration and the loop is
 *                  tested at the bottom, so at least eight columns are
 *                  processed and dest_width is effectively rounded up to a
 *                  multiple of eight.
 *
 *                  NOTE(review): no emms is executed here; presumably the
 *                  caller restores the FPU state - confirm.
330 *
331 ****************************************************************************/
332static
333void vertical_band_4_5_scale_mmx
334(
335    unsigned char *dest,
336    unsigned int dest_pitch,
337    unsigned int dest_width
338)
339{
340    __asm
341    {
342
343        mov         esi,    dest                    // Get the source and destination pointer
344        mov         ecx,    dest_pitch               // Get the pitch size
345
346        lea         edi,    [esi+ecx*2]             // two lines below
347        add         edi,    ecx                     // three lines below
348
349        pxor        mm7,    mm7                     // clear out mm7
350        mov         edx,    dest_width               // Loop counter
351
352        vs_4_5_loop:
353
        // ---- des[1] = a * 1/5 + b * 4/5 ----
354        movq        mm0,    QWORD ptr [esi]         // src[0];
355        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
356
357        movq        mm2,    mm0                     // Make a copy
358        punpcklbw   mm0,    mm7                     // unpack low to word
359
360        movq        mm5,    one_fifth
361        punpckhbw   mm2,    mm7                     // unpack high to word
362
363        pmullw      mm0,    mm5                     // a * 1/5
364
365        movq        mm3,    mm1                     // make a copy
366        punpcklbw   mm1,    mm7                     // unpack low to word
367
368        pmullw      mm2,    mm5                     // a * 1/5
369        movq        mm6,    four_fifths               // mm6 = 4/5
370
371        movq        mm4,    mm1                     // copy of low b
372        pmullw      mm4,    mm6                     // b * 4/5
373
374        punpckhbw   mm3,    mm7                     // unpack high to word
375        movq        mm5,    mm3                     // copy of high b
376
377        pmullw      mm5,    mm6                     // b * 4/5
378        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
379
380        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
381        paddw       mm0,    round_values             // + 128
382
383        paddw       mm2,    round_values             // + 128
384        psrlw       mm0,    8
385
386        psrlw       mm2,    8
387        packuswb    mm0,    mm2                     // des [1]
388
        // The unpacked original src[1] stays in mm1/mm3 for the next stage,
        // so overwriting row 1 here is safe.
389        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
390        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
391
        // ---- des[2] = b * 2/5 + c * 3/5 ----
392        // mm1, mm3 --- Src[1]
393        // mm0 --- Src[2]
394        // mm7 for unpacking
395
396        movq        mm5,    two_fifths
397        movq        mm2,    mm0                     // make a copy
398
399        pmullw      mm1,    mm5                     // b * 2/5
400        movq        mm6,    three_fifths
401
402
403        punpcklbw   mm0,    mm7                     // unpack low to word
404        pmullw      mm3,    mm5                     // b * 2/5
405
406        movq        mm4,    mm0                     // make copy of c
407        punpckhbw   mm2,    mm7                     // unpack high to word
408
409        pmullw      mm4,    mm6                     // c * 3/5
410        movq        mm5,    mm2
411
412        pmullw      mm5,    mm6                     // c * 3/5
413        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
414
415        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
416        paddw       mm1,    round_values             // + 128
417
418        paddw       mm3,    round_values             // + 128
419        psrlw       mm1,    8
420
421        psrlw       mm3,    8
422        packuswb    mm1,    mm3                     // des[2]
423
424        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
425        movq        mm1,    [edi]                   // mm1=Src[3];
426
        // ---- des[3] = c * 3/5 + d * 2/5 ----
427        // mm0, mm2 --- Src[2]
428        // mm1 --- Src[3]
429        // mm6 --- 3/5
430        // mm7 for unpacking
431
432        pmullw      mm0,    mm6                     // c * 3/5
433        movq        mm5,    two_fifths               // mm5 = 2/5
434
435        movq        mm3,    mm1                     // make a copy
436        pmullw      mm2,    mm6                     // c * 3/5
437
438        punpcklbw   mm1,    mm7                     // unpack low
439        movq        mm4,    mm1                     // make a copy
440
441        punpckhbw   mm3,    mm7                     // unpack high
442        pmullw      mm4,    mm5                     // d * 2/5
443
444        movq        mm6,    mm3                     // make a copy
445        pmullw      mm6,    mm5                     // d * 2/5
446
447        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
448        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
449
450        paddw       mm0,    round_values             // + 128
451        paddw       mm2,    round_values             // + 128
452
453        psrlw       mm0,    8
454        psrlw       mm2,    8
455
456        packuswb    mm0,    mm2                     // des[3]
457        movq        QWORD ptr [edi], mm0            // write des[3]
458
        // ---- des[4] = d * 4/5 + an * 1/5 ----
459        //  mm1, mm3 --- Src[3]
460        //  mm7 -- cleared for unpacking
461
        // edi + 2 * pitch is row 5 relative to dest: the first row of the
        // band below.
462        movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group
463
464        movq        mm5,    four_fifths              // mm5 = 4/5
465        pmullw      mm1,    mm5                     // d * 4/5
466
467        movq        mm6,    one_fifth                // mm6 = 1/5
468        movq        mm2,    mm0                     // make a copy
469
470        pmullw      mm3,    mm5                     // d * 4/5
471        punpcklbw   mm0,    mm7                     // unpack low
472
473        pmullw      mm0,    mm6                     // an * 1/5
474        punpckhbw   mm2,    mm7                     // unpack high
475
476        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
477        pmullw      mm2,    mm6                     // an * 1/5
478
479        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
480        paddw       mm1,    round_values             // + 128
481
482        paddw       mm3,    round_values             // + 128
483        psrlw       mm1,    8
484
485        psrlw       mm3,    8
486        packuswb    mm1,    mm3                     // des[4]
487
488        movq        QWORD ptr [edi+ecx], mm1        // write des[4]
489
        // Advance both row pointers to the next eight columns.
490        add         edi,    8
491        add         esi,    8
492
493        sub         edx,    8
494        jg         vs_4_5_loop
495    }
496}
497
498/****************************************************************************
499 *
500 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
501 *
502 *  INPUTS        : unsigned char *dest    : Row 0 of the band; scaled in place.
503 *                  unsigned int dest_pitch : Byte offset from one row to the next.
504 *                  unsigned int dest_width : Number of columns to process.
505 *
506 *  OUTPUTS       : None.
507 *
508 *  RETURNS       : None
509 *
510 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
511 *
512 *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does
513 *                  not read any row below the band: row 4 is a plain copy
514 *                  of row 3. The function also has a "C" only version.
 *
 *                  With rN the contents of row N (dest + N * dest_pitch)
 *                  on entry, the rows are rewritten as
 *                      row 1 = ( 51 * r0 + 205 * r1 + 128) >> 8
 *                      row 2 = (102 * r1 + 154 * r2 + 128) >> 8
 *                      row 3 = (154 * r2 + 102 * r3 + 128) >> 8
 *                      row 4 = r3
 *                  Row 0 is only read; row 4 is only written.
 *
 *                  Eight columns are handled per iteration and the loop is
 *                  tested at the bottom, so at least eight columns are
 *                  processed and dest_width is effectively rounded up to a
 *                  multiple of eight.
 *
 *                  NOTE(review): no emms is executed here; presumably the
 *                  caller restores the FPU state - confirm.
515 *
516 ****************************************************************************/
517static
518void last_vertical_band_4_5_scale_mmx
519(
520    unsigned char *dest,
521    unsigned int dest_pitch,
522    unsigned int dest_width
523)
524{
525    __asm
526    {
527        mov         esi,    dest                    // Get the source and destination pointer
528        mov         ecx,    dest_pitch               // Get the pitch size
529
530        lea         edi,    [esi+ecx*2]             // two lines below
531        add         edi,    ecx                     // three lines below
532
533        pxor        mm7,    mm7                     // clear out mm7
534        mov         edx,    dest_width               // Loop counter
535
536        last_vs_4_5_loop:
537
        // ---- des[1] = a * 1/5 + b * 4/5 ----
538        movq        mm0,    QWORD ptr [esi]         // src[0];
539        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
540
541        movq        mm2,    mm0                     // Make a copy
542        punpcklbw   mm0,    mm7                     // unpack low to word
543
544        movq        mm5,    one_fifth
545        punpckhbw   mm2,    mm7                     // unpack high to word
546
547        pmullw      mm0,    mm5                     // a * 1/5
548
549        movq        mm3,    mm1                     // make a copy
550        punpcklbw   mm1,    mm7                     // unpack low to word
551
552        pmullw      mm2,    mm5                     // a * 1/5
553        movq        mm6,    four_fifths               // mm6 = 4/5
554
555        movq        mm4,    mm1                     // copy of low b
556        pmullw      mm4,    mm6                     // b * 4/5
557
558        punpckhbw   mm3,    mm7                     // unpack high to word
559        movq        mm5,    mm3                     // copy of high b
560
561        pmullw      mm5,    mm6                     // b * 4/5
562        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
563
564        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
565        paddw       mm0,    round_values             // + 128
566
567        paddw       mm2,    round_values             // + 128
568        psrlw       mm0,    8
569
570        psrlw       mm2,    8
571        packuswb    mm0,    mm2                     // des [1]
572
        // The unpacked original src[1] stays in mm1/mm3 for the next stage,
        // so overwriting row 1 here is safe.
573        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
574        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
575
        // ---- des[2] = b * 2/5 + c * 3/5 ----
576        // mm1, mm3 --- Src[1]
577        // mm0 --- Src[2]
578        // mm7 for unpacking
579
580        movq        mm5,    two_fifths
581        movq        mm2,    mm0                     // make a copy
582
583        pmullw      mm1,    mm5                     // b * 2/5
584        movq        mm6,    three_fifths
585
586
587        punpcklbw   mm0,    mm7                     // unpack low to word
588        pmullw      mm3,    mm5                     // b * 2/5
589
590        movq        mm4,    mm0                     // make copy of c
591        punpckhbw   mm2,    mm7                     // unpack high to word
592
593        pmullw      mm4,    mm6                     // c * 3/5
594        movq        mm5,    mm2
595
596        pmullw      mm5,    mm6                     // c * 3/5
597        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
598
599        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
600        paddw       mm1,    round_values             // + 128
601
602        paddw       mm3,    round_values             // + 128
603        psrlw       mm1,    8
604
605        psrlw       mm3,    8
606        packuswb    mm1,    mm3                     // des[2]
607
608        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
609        movq        mm1,    [edi]                   // mm1=Src[3];
610
        // Last band: there is no band below to blend with, so des[4] is the
        // unmodified src[3].  It is stored before row 3 is overwritten.
611        movq        QWORD ptr [edi+ecx], mm1        // write des[4];
612
        // ---- des[3] = c * 3/5 + d * 2/5 ----
613        // mm0, mm2 --- Src[2]
614        // mm1 --- Src[3]
615        // mm6 --- 3/5
616        // mm7 for unpacking
617
618        pmullw      mm0,    mm6                     // c * 3/5
619        movq        mm5,    two_fifths               // mm5 = 2/5
620
621        movq        mm3,    mm1                     // make a copy
622        pmullw      mm2,    mm6                     // c * 3/5
623
624        punpcklbw   mm1,    mm7                     // unpack low
625        movq        mm4,    mm1                     // make a copy
626
627        punpckhbw   mm3,    mm7                     // unpack high
628        pmullw      mm4,    mm5                     // d * 2/5
629
630        movq        mm6,    mm3                     // make a copy
631        pmullw      mm6,    mm5                     // d * 2/5
632
633        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
634        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
635
636        paddw       mm0,    round_values             // + 128
637        paddw       mm2,    round_values             // + 128
638
639        psrlw       mm0,    8
640        psrlw       mm2,    8
641
642        packuswb    mm0,    mm2                     // des[3]
643        movq        QWORD ptr [edi], mm0            // write des[3]
644
645        //  mm1, mm3 --- Src[3]
646        //  mm7 -- cleared for unpacking
        // Advance both row pointers to the next eight columns.
647        add         edi,    8
648        add         esi,    8
649
650        sub         edx,    8
651        jg          last_vs_4_5_loop
652    }
653}
654
655/****************************************************************************
656 *
657 *  ROUTINE       : vertical_band_3_5_scale_mmx
658 *
659 *  INPUTS        : unsigned char *dest    : Row 0 of the band; scaled in place.
660 *                  unsigned int dest_pitch : Byte offset from one row to the next.
661 *                  unsigned int dest_width : Number of columns to process.
662 *
663 *  OUTPUTS       : None.
664 *
665 *  RETURNS       : void
666 *
667 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
668 *
669 *  SPECIAL NOTES : The routine uses the first line of the band below
670 *                  the current band. The function also has an "C" only
671 *                  version.
 *
 *                  With rN the contents of row N (dest + N * dest_pitch)
 *                  on entry, the rows are rewritten as
 *                      row 1 = (102 * r0 + 154 * r1 + 128) >> 8
 *                      row 2 = (205 * r1 +  51 * r2 + 128) >> 8
 *                      row 3 = ( 51 * r1 + 205 * r2 + 128) >> 8
 *                      row 4 = (154 * r2 + 102 * r5 + 128) >> 8
 *                  Rows 0 and 5 are only read; rows 3 and 4 are only
 *                  written.
 *
 *                  Eight columns are handled per iteration and the loop is
 *                  tested at the bottom, so at least eight columns are
 *                  processed and dest_width is effectively rounded up to a
 *                  multiple of eight.
 *
 *                  NOTE(review): no emms is executed here; presumably the
 *                  caller restores the FPU state - confirm.
672 *
673 ****************************************************************************/
674static
675void vertical_band_3_5_scale_mmx
676(
677    unsigned char *dest,
678    unsigned int dest_pitch,
679    unsigned int dest_width
680)
681{
682    __asm
683    {
684        mov         esi,    dest                    // Get the source and destination pointer
685        mov         ecx,    dest_pitch               // Get the pitch size
686
687        lea         edi,    [esi+ecx*2]             // two lines below
688        add         edi,    ecx                     // three lines below
689
690        pxor        mm7,    mm7                     // clear out mm7
691        mov         edx,    dest_width               // Loop counter
692
693        vs_3_5_loop:
694
        // ---- des[1] = a * 2/5 + b * 3/5 ----
695        movq        mm0,    QWORD ptr [esi]         // src[0];
696        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
697
698        movq        mm2,    mm0                     // Make a copy
699        punpcklbw   mm0,    mm7                     // unpack low to word
700
701        movq        mm5,    two_fifths               // mm5 = 2/5
702        punpckhbw   mm2,    mm7                     // unpack high to word
703
704        pmullw      mm0,    mm5                     // a * 2/5
705
706        movq        mm3,    mm1                     // make a copy
707        punpcklbw   mm1,    mm7                     // unpack low to word
708
709        pmullw      mm2,    mm5                     // a * 2/5
710        movq        mm6,    three_fifths             // mm6 = 3/5
711
712        movq        mm4,    mm1                     // copy of low b
713        pmullw      mm4,    mm6                     // b * 3/5
714
715        punpckhbw   mm3,    mm7                     // unpack high to word
716        movq        mm5,    mm3                     // copy of high b
717
718        pmullw      mm5,    mm6                     // b * 3/5
719        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
720
721        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
722        paddw       mm0,    round_values             // + 128
723
724        paddw       mm2,    round_values             // + 128
725        psrlw       mm0,    8
726
727        psrlw       mm2,    8
728        packuswb    mm0,    mm2                     // des [1]
729
        // The unpacked original src[1] stays in mm1/mm3 for the next stage,
        // so overwriting row 1 here is safe.
730        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
731        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
732
        // ---- des[2] = b * 4/5 + c * 1/5 and des[3] = b * 1/5 + c * 4/5 ----
733        // mm1, mm3 --- Src[1]
734        // mm0 --- Src[2]
735        // mm7 for unpacking
736
737        movq        mm4,    mm1                     // b low
738        pmullw      mm1,    four_fifths              // b * 4/5 low
739
740        movq        mm5,    mm3                     // b high
741        pmullw      mm3,    four_fifths              // b * 4/5 high
742
743        movq        mm2,    mm0                     // c
744        pmullw      mm4,    one_fifth                // b * 1/5
745
746        punpcklbw   mm0,    mm7                     // c low
747        pmullw      mm5,    one_fifth                // b * 1/5
748
749        movq        mm6,    mm0                     // make copy of c low
750        punpckhbw   mm2,    mm7                     // c high
751
        // From here on mm7 is used as a scratch register; it is cleared
        // again before the next unpack.
752        pmullw      mm6,    one_fifth                // c * 1/5 low
753        movq        mm7,    mm2                     // make copy of c high
754
755        pmullw      mm7,    one_fifth                // c * 1/5 high
756        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
757
758        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
759        movq        mm6,    mm0                     // make copy of c low
760
761        pmullw      mm6,    four_fifths              // c * 4/5 low
762        movq        mm7,    mm2                     // make copy of c high
763
764        pmullw      mm7,    four_fifths              // c * 4/5 high
765
766        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
767        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
768
769        paddw       mm1,    round_values             // + 128
770        paddw       mm3,    round_values             // + 128
771
772        psrlw       mm1,    8
773        psrlw       mm3,    8
774
775        packuswb    mm1,    mm3                     // des[2]
776        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
777
778        paddw       mm4,    round_values             // + 128
779        paddw       mm5,    round_values             // + 128
780
781        psrlw       mm4,    8
782        psrlw       mm5,    8
783
784        packuswb    mm4,    mm5                     // des[3]
785        movq        QWORD ptr [edi], mm4            // write des[3]
786
        // ---- des[4] = c * 3/5 + an * 2/5 ----
787        //  mm0, mm2 --- Src[2] (c), unpacked low / high
788
789        pxor        mm7,    mm7                     // clear mm7 for unpacking
        // edi + 2 * pitch is row 5 relative to dest: the first row of the
        // band below.
790        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group
791
792        movq        mm5,    three_fifths             // mm5 = 3/5
793        pmullw      mm0,    mm5                     // c * 3/5
794
795        movq        mm6,    two_fifths                // mm6 = 2/5
796        movq        mm3,    mm1                     // make a copy
797
798        pmullw      mm2,    mm5                     // c * 3/5
799        punpcklbw   mm1,    mm7                     // unpack low
800
801        pmullw      mm1,    mm6                     // an * 2/5
802        punpckhbw   mm3,    mm7                     // unpack high
803
804        paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
805        pmullw      mm3,    mm6                     // an * 2/5
806
807        paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
808        paddw       mm0,    round_values             // + 128
809
810        paddw       mm2,    round_values             // + 128
811        psrlw       mm0,    8
812
813        psrlw       mm2,    8
814        packuswb    mm0,    mm2                     // des[4]
815
816        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
817
        // Advance both row pointers to the next eight columns.
818        add         edi,    8
819        add         esi,    8
820
821        sub         edx,    8
822        jg          vs_3_5_loop
823    }
824}
825
826/****************************************************************************
827 *
828 *  ROUTINE       : last_vertical_band_3_5_scale_mmx
829 *
830 *  INPUTS        : unsigned char *dest    :
831 *                  unsigned int dest_pitch :
832 *                  unsigned int dest_width :
833 *
834 *  OUTPUTS       : None.
835 *
836 *  RETURNS       : void
837 *
838 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
839 *
840 *  SPECIAL NOTES : The routine uses the first line of the band below
841 *                  the current band. The function also has an "C" only
842 *                  version.
843 *
844 ****************************************************************************/
845static
846void last_vertical_band_3_5_scale_mmx
847(
848    unsigned char *dest,
849    unsigned int dest_pitch,
850    unsigned int dest_width
851)
852{
853    __asm
854    {
855        mov         esi,    dest                    // Get the source and destination pointer
856        mov         ecx,    dest_pitch               // Get the pitch size
857
858        lea         edi,    [esi+ecx*2]             // tow lines below
859        add         edi,    ecx                     // three lines below
860
861        pxor        mm7,    mm7                     // clear out mm7
862        mov         edx,    dest_width               // Loop counter
863
864
865        last_vs_3_5_loop:
866
867        movq        mm0,    QWORD ptr [esi]         // src[0];
868        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
869
870        movq        mm2,    mm0                     // Make a copy
871        punpcklbw   mm0,    mm7                     // unpack low to word
872
873        movq        mm5,    two_fifths               // mm5 = 2/5
874        punpckhbw   mm2,    mm7                     // unpack high to word
875
876        pmullw      mm0,    mm5                     // a * 2/5
877
878        movq        mm3,    mm1                     // make a copy
879        punpcklbw   mm1,    mm7                     // unpack low to word
880
881        pmullw      mm2,    mm5                     // a * 2/5
882        movq        mm6,    three_fifths             // mm6 = 3/5
883
884        movq        mm4,    mm1                     // copy of low b
885        pmullw      mm4,    mm6                     // b * 3/5
886
887        punpckhbw   mm3,    mm7                     // unpack high to word
888        movq        mm5,    mm3                     // copy of high b
889
890        pmullw      mm5,    mm6                     // b * 3/5
891        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
892
893        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
894        paddw       mm0,    round_values             // + 128
895
896        paddw       mm2,    round_values             // + 128
897        psrlw       mm0,    8
898
899        psrlw       mm2,    8
900        packuswb    mm0,    mm2                     // des [1]
901
902        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
903        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
904
905
906
907        // mm1, mm3 --- Src[1]
908        // mm0 --- Src[2]
909        // mm7 for unpacking
910
911        movq        mm4,    mm1                     // b low
912        pmullw      mm1,    four_fifths              // b * 4/5 low
913
914        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
915
916        movq        mm5,    mm3                     // b high
917        pmullw      mm3,    four_fifths              // b * 4/5 high
918
919        movq        mm2,    mm0                     // c
920        pmullw      mm4,    one_fifth                // b * 1/5
921
922        punpcklbw   mm0,    mm7                     // c low
923        pmullw      mm5,    one_fifth                // b * 1/5
924
925        movq        mm6,    mm0                     // make copy of c low
926        punpckhbw   mm2,    mm7                     // c high
927
928        pmullw      mm6,    one_fifth                // c * 1/5 low
929        movq        mm7,    mm2                     // make copy of c high
930
931        pmullw      mm7,    one_fifth                // c * 1/5 high
932        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
933
934        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
935        movq        mm6,    mm0                     // make copy of c low
936
937        pmullw      mm6,    four_fifths              // c * 4/5 low
938        movq        mm7,    mm2                     // make copy of c high
939
940        pmullw      mm7,    four_fifths              // c * 4/5 high
941
942        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
943        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
944
945        paddw       mm1,    round_values             // + 128
946        paddw       mm3,    round_values             // + 128
947
948        psrlw       mm1,    8
949        psrlw       mm3,    8
950
951        packuswb    mm1,    mm3                     // des[2]
952        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
953
954        paddw       mm4,    round_values             // + 128
955        paddw       mm5,    round_values             // + 128
956
957        psrlw       mm4,    8
958        psrlw       mm5,    8
959
960        packuswb    mm4,    mm5                     // des[3]
961        movq        QWORD ptr [edi], mm4            // write des[3]
962
963        //  mm0, mm2 --- Src[3]
964
965        add         edi,    8
966        add         esi,    8
967
968        sub         edx,    8
969        jg          last_vs_3_5_loop
970    }
971}
972
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : Line 0 of the band (read only here).
 *                  unsigned int dest_pitch : Byte distance between consecutive lines.
 *                  unsigned int dest_width : Pixels per line; handled eight per pass.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a band of pixels. The line at
 *                  dest + dest_pitch is filled with the rounded average
 *                  (a + b + 1) >> 1 of the lines at dest and
 *                  dest + 2 * dest_pitch.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band (the line two pitches below dest).
 *                  The function also has an "C" only version.
 *
 ****************************************************************************/
static
void vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {

        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        pxor        mm7,    mm7                     // clear out mm7 (zero for unpacking; never overwritten below)
        mov         edx,    dest_width               // Loop counter (pixels left)

        vs_1_2_loop:

        movq        mm0,    [esi]                   // get Src[0]: eight pixels of line 0
        movq        mm1,    [esi + ecx * 2]         // get Src[1]: eight pixels of line 2

        movq        mm2,    mm0                     // make copy before unpack
        movq        mm3,    mm1                     // make copy before unpack

        punpcklbw   mm0,    mm7                     // low Src[0]
        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1

        punpcklbw   mm1,    mm7                     // low Src[1]
        paddw       mm0,    mm1                     // low (a + b)

        punpckhbw   mm2,    mm7                     // high Src[0]
        paddw       mm0,    mm6                     // low (a + b + 1)

        punpckhbw   mm3,    mm7                     // high Src[1]
        paddw       mm2,    mm3                     // high (a + b )

        psraw       mm0,    1                       // low (a + b +1 )/2
        paddw       mm2,    mm6                     // high (a + b + 1)

        psraw       mm2,    1                       // high (a + b + 1)/2
        packuswb    mm0,    mm2                     // pack results

        movq        [esi+ecx], mm0                  // write out eight bytes to line 1
        add         esi,    8                       // next eight pixels

        sub         edx,    8                       // flags from this sub drive the jg
        jg          vs_1_2_loop
    }

}
1043
/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : Line 0 of the band (read only here).
 *                  unsigned int dest_pitch : Byte distance between consecutive lines.
 *                  unsigned int dest_width : Pixels per line; handled eight per pass.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of the last band of pixels: the line
 *                  at dest is copied to the line at dest + dest_pitch.
 *
 *  SPECIAL NOTES : No line below the band is read; the new line is a plain
 *                  duplicate rather than an average. The function also
 *                  has an "C" only version.
 *
 ****************************************************************************/
static
void last_vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        mov         edx,    dest_width               // Loop counter (pixels left)

        last_vs_1_2_loop:

        movq        mm0,    [esi]                   // get Src[0]: eight pixels of line 0
        movq        [esi+ecx], mm0                  // write out eight bytes to line 1

        add         esi,    8                       // next eight pixels
        sub         edx,    8                       // flags from this sub drive the jg

        jg         last_vs_1_2_loop
    }
}
1089
1090/****************************************************************************
1091 *
1092 *  ROUTINE       : horizontal_line_1_2_scale
1093 *
1094 *  INPUTS        : const unsigned char *source :
1095 *                  unsigned int source_width    :
1096 *                  unsigned char *dest         :
1097 *                  unsigned int dest_width      :
1098 *
1099 *  OUTPUTS       : None.
1100 *
1101 *  RETURNS       : void
1102 *
1103 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
1104 *
1105 *  SPECIAL NOTES : None.
1106 *
1107 ****************************************************************************/
1108static
1109void horizontal_line_1_2_scale_mmx
1110(
1111    const unsigned char *source,
1112    unsigned int source_width,
1113    unsigned char *dest,
1114    unsigned int dest_width
1115)
1116{
1117    (void) dest_width;
1118
1119    __asm
1120    {
1121        mov         esi,    source
1122        mov         edi,    dest
1123
1124        pxor        mm7,    mm7
1125        movq        mm6,    four_ones
1126
1127        mov         ecx,    source_width
1128
1129        hs_1_2_loop:
1130
1131        movq        mm0,    [esi]
1132        movq        mm1,    [esi+1]
1133
1134        movq        mm2,    mm0
1135        movq        mm3,    mm1
1136
1137        movq        mm4,    mm0
1138        punpcklbw   mm0,    mm7
1139
1140        punpcklbw   mm1,    mm7
1141        paddw       mm0,    mm1
1142
1143        paddw       mm0,    mm6
1144        punpckhbw   mm2,    mm7
1145
1146        punpckhbw   mm3,    mm7
1147        paddw       mm2,    mm3
1148
1149        paddw       mm2,    mm6
1150        psraw       mm0,    1
1151
1152        psraw       mm2,    1
1153        packuswb    mm0,    mm2
1154
1155        movq        mm2,    mm4
1156        punpcklbw   mm2,    mm0
1157
1158        movq        [edi],  mm2
1159        punpckhbw   mm4,    mm0
1160
1161        movq        [edi+8], mm4
1162        add         esi,    8
1163
1164        add         edi,    16
1165        sub         ecx,    8
1166
1167        cmp         ecx,    8
1168        jg          hs_1_2_loop
1169
1170// last eight pixel
1171
1172        movq        mm0,    [esi]
1173        movq        mm1,    mm0
1174
1175        movq        mm2,    mm0
1176        movq        mm3,    mm1
1177
1178        psrlq       mm1,    8
1179        psrlq       mm3,    56
1180
1181        psllq       mm3,    56
1182        por         mm1,    mm3
1183
1184        movq        mm3,    mm1
1185        movq        mm4,    mm0
1186
1187        punpcklbw   mm0,    mm7
1188        punpcklbw   mm1,    mm7
1189
1190        paddw       mm0,    mm1
1191        paddw       mm0,    mm6
1192
1193        punpckhbw   mm2,    mm7
1194        punpckhbw   mm3,    mm7
1195
1196        paddw       mm2,    mm3
1197        paddw       mm2,    mm6
1198
1199        psraw       mm0,    1
1200        psraw       mm2,    1
1201
1202        packuswb    mm0,    mm2
1203        movq        mm2,    mm4
1204
1205        punpcklbw   mm2,    mm0
1206        movq        [edi],  mm2
1207
1208        punpckhbw   mm4,    mm0
1209        movq        [edi+8], mm4
1210    }
1211}
1212
1213
1214
1215
1216
/* Per-lane weights (out of 256) for horizontal_line_5_4_scale_mmx:
 * const54_1 multiplies source pixels 0..3 and const54_2 multiplies source
 * pixels 1..4; each pair of lanes sums to 256. */
__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
1219
1220
1221/****************************************************************************
1222 *
1223 *  ROUTINE       : horizontal_line_5_4_scale_mmx
1224 *
1225 *  INPUTS        : const unsigned char *source : Pointer to source data.
1226 *                  unsigned int source_width    : Stride of source.
1227 *                  unsigned char *dest         : Pointer to destination data.
1228 *                  unsigned int dest_width      : Stride of destination (NOT USED).
1229 *
1230 *  OUTPUTS       : None.
1231 *
1232 *  RETURNS       : void
1233 *
1234 *  FUNCTION      : Copies horizontal line of pixels from source to
1235 *                  destination scaling up by 4 to 5.
1236 *
1237 *  SPECIAL NOTES : None.
1238 *
1239 ****************************************************************************/
1240static
1241void horizontal_line_5_4_scale_mmx
1242(
1243    const unsigned char *source,
1244    unsigned int source_width,
1245    unsigned char *dest,
1246    unsigned int dest_width
1247)
1248{
1249    /*
1250    unsigned i;
1251    unsigned int a, b, c, d, e;
1252    unsigned char *des = dest;
1253    const unsigned char *src = source;
1254
1255    (void) dest_width;
1256
1257    for ( i=0; i<source_width; i+=5 )
1258    {
1259        a = src[0];
1260        b = src[1];
1261        c = src[2];
1262        d = src[3];
1263        e = src[4];
1264
1265        des[0] = a;
1266        des[1] = ((b*192 + c* 64 + 128)>>8);
1267        des[2] = ((c*128 + d*128 + 128)>>8);
1268        des[3] = ((d* 64 + e*192 + 128)>>8);
1269
1270        src += 5;
1271        des += 4;
1272    }
1273    */
1274    (void) dest_width;
1275
1276    __asm
1277    {
1278
1279        mov         esi,        source              ;
1280        mov         edi,        dest                ;
1281
1282        mov         ecx,        source_width         ;
1283        movq        mm5,        const54_1           ;
1284
1285        pxor        mm7,        mm7                 ;
1286        movq        mm6,        const54_2           ;
1287
1288        movq        mm4,        round_values         ;
1289        lea         edx,        [esi+ecx]           ;
1290        horizontal_line_5_4_loop:
1291
1292        movq        mm0,        QWORD PTR  [esi]    ;
1293        00 01 02 03 04 05 06 07
1294        movq        mm1,        mm0                 ;
1295        00 01 02 03 04 05 06 07
1296
1297        psrlq       mm0,        8                   ;
1298        01 02 03 04 05 06 07 xx
1299        punpcklbw   mm1,        mm7                 ;
1300        xx 00 xx 01 xx 02 xx 03
1301
1302        punpcklbw   mm0,        mm7                 ;
1303        xx 01 xx 02 xx 03 xx 04
1304        pmullw      mm1,        mm5
1305
1306        pmullw      mm0,        mm6
1307        add         esi,        5
1308
1309        add         edi,        4
1310        paddw       mm1,        mm0
1311
1312        paddw       mm1,        mm4
1313        psrlw       mm1,        8
1314
1315        cmp         esi,        edx
1316        packuswb    mm1,        mm7
1317
1318        movd        DWORD PTR [edi-4], mm1
1319
1320        jl          horizontal_line_5_4_loop
1321
1322    }
1323
1324}
/* 1/4, 2/4 and 3/4 expressed as fractions of 256, replicated across four
 * word lanes; used as pmullw weights by vertical_band_5_4_scale_mmx. */
__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
1328
/*
 * 5 to 4 vertical down-scaling of one band, four pixels per pass.
 *
 * With r0..r4 the five source lines starting at `source` (src_pitch apart),
 * the four destination lines starting at `dest` (dest_pitch apart) become:
 *     line 0 = r0
 *     line 1 = (r1 * 192 + r2 *  64 + 128) >> 8
 *     line 2 = (r2 * 128 + r3 * 128 + 128) >> 8
 *     line 3 = (r3 *  64 + r4 * 192 + 128) >> 8
 * dest_width is the number of pixels per line; the loop steps by four and
 * always runs at least once.
 */
static
void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{

    __asm
    {
        push        ebx                             // ebx is used as the loop counter below

        mov         esi,    source                    // esi = source line 0
        mov         ecx,    src_pitch               // ecx = source pitch

        mov         edi,    dest                    // edi = destination line 0
        pxor        mm7,    mm7                     // clear out mm7 (zero for unpack/pack)

        mov         edx,    dest_pitch               // edx = destination pitch
        mov         ebx,    dest_width               // ebx = loop counter (pixels left)

        vs_5_4_loop:

        movd        mm0,    DWORD ptr [esi]         // src[0];
        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];

        movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2];
        lea         eax,    [esi+ecx*2]             // eax = address of source line 2

        punpcklbw   mm1,    mm7
        punpcklbw   mm2,    mm7

        movq        mm3,    mm2                     // second copy of src[2]
        pmullw      mm1,    three_fourths           // src[1] * 3/4

        pmullw      mm2,    one_fourths             // src[2] * 1/4
        movd        mm4,    [eax+ecx]               // src[3];

        pmullw      mm3,    two_fourths             // src[2] * 2/4
        punpcklbw   mm4,    mm7

        movq        mm5,    mm4                     // second copy of src[3]
        pmullw      mm4,    two_fourths             // src[3] * 2/4

        paddw       mm1,    mm2                     // src[1] * 3/4 + src[2] * 1/4
        movd        mm6,    [eax+ecx*2]             // src[4];

        pmullw      mm5,    one_fourths             // src[3] * 1/4
        paddw       mm1,    round_values;           // + 128

        paddw       mm3,    mm4                     // src[2] * 2/4 + src[3] * 2/4
        psrlw       mm1,    8

        punpcklbw   mm6,    mm7
        paddw       mm3,    round_values            // + 128

        pmullw      mm6,    three_fourths           // src[4] * 3/4
        psrlw       mm3,    8

        packuswb    mm1,    mm7                     // des[1]
        packuswb    mm3,    mm7                     // des[2]

        movd        DWORD PTR [edi], mm0            // write des[0] (unmodified src[0])
        movd        DWORD PTR [edi+edx], mm1        // write des[1]


        paddw       mm5,    mm6                     // src[3] * 1/4 + src[4] * 3/4
        movd        DWORD PTR [edi+edx*2], mm3      // write des[2]

        lea         eax,    [edi+edx*2]             // eax = address of destination line 2
        paddw       mm5,    round_values            // + 128

        psrlw       mm5,    8
        add         edi,    4                       // next four pixels (eax still refers to this pass)

        packuswb    mm5,    mm7                     // des[3]
        movd        DWORD PTR [eax+edx], mm5        // write des[3]

        add         esi,    4
        sub         ebx,    4                       // flags from this sub drive the jg

        jg         vs_5_4_loop

        pop         ebx
    }
}
1411
1412
/* Per-lane weights (out of 256) for horizontal_line_5_3_scale_mmx:
 * const53_2 multiplies the even source pixels (0, 2, 4, 6) and const53_1
 * multiplies the odd source pixels after they are shifted up one lane
 * (-, 1, 3, 5). The fourth lane is weighted zero in both tables. */
__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
1415
1416
1417static
1418void horizontal_line_5_3_scale_mmx
1419(
1420    const unsigned char *source,
1421    unsigned int source_width,
1422    unsigned char *dest,
1423    unsigned int dest_width
1424)
1425{
1426
1427    (void) dest_width;
1428    __asm
1429    {
1430
1431        mov         esi,        source              ;
1432        mov         edi,        dest                ;
1433
1434        mov         ecx,        source_width         ;
1435        movq        mm5,        const53_1           ;
1436
1437        pxor        mm7,        mm7                 ;
1438        movq        mm6,        const53_2           ;
1439
1440        movq        mm4,        round_values         ;
1441        lea         edx,        [esi+ecx-5]         ;
1442        horizontal_line_5_3_loop:
1443
1444        movq        mm0,        QWORD PTR  [esi]    ;
1445        00 01 02 03 04 05 06 07
1446        movq        mm1,        mm0                 ;
1447        00 01 02 03 04 05 06 07
1448
1449        psllw       mm0,        8                   ;
1450        xx 00 xx 02 xx 04 xx 06
1451        psrlw       mm1,        8                   ;
1452        01 xx 03 xx 05 xx 07 xx
1453
1454        psrlw       mm0,        8                   ;
1455        00 xx 02 xx 04 xx 06 xx
1456        psllq       mm1,        16                  ;
1457        xx xx 01 xx 03 xx 05 xx
1458
1459        pmullw      mm0,        mm6
1460
1461        pmullw      mm1,        mm5
1462        add         esi,        5
1463
1464        add         edi,        3
1465        paddw       mm1,        mm0
1466
1467        paddw       mm1,        mm4
1468        psrlw       mm1,        8
1469
1470        cmp         esi,        edx
1471        packuswb    mm1,        mm7
1472
1473        movd        DWORD PTR [edi-3], mm1
1474        jl          horizontal_line_5_3_loop
1475
1476//exit condition
1477        movq        mm0,        QWORD PTR  [esi]    ;
1478        00 01 02 03 04 05 06 07
1479        movq        mm1,        mm0                 ;
1480        00 01 02 03 04 05 06 07
1481
1482        psllw       mm0,        8                   ;
1483        xx 00 xx 02 xx 04 xx 06
1484        psrlw       mm1,        8                   ;
1485        01 xx 03 xx 05 xx 07 xx
1486
1487        psrlw       mm0,        8                   ;
1488        00 xx 02 xx 04 xx 06 xx
1489        psllq       mm1,        16                  ;
1490        xx xx 01 xx 03 xx 05 xx
1491
1492        pmullw      mm0,        mm6
1493
1494        pmullw      mm1,        mm5
1495        paddw       mm1,        mm0
1496
1497        paddw       mm1,        mm4
1498        psrlw       mm1,        8
1499
1500        packuswb    mm1,        mm7
1501        movd        eax,        mm1
1502
1503        mov         edx,        eax
1504        shr         edx,        16
1505
1506        mov         WORD PTR[edi],   ax
1507        mov         BYTE PTR[edi+2], dl
1508
1509    }
1510
1511}
1512
/* 85 and 171 (summing to 256) replicated across four word lanes; used as
 * the one-third / two-thirds pmullw weights by vertical_band_5_3_scale_mmx. */
__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
1515
/*
 * 5 to 3 vertical down-scaling of one band, four pixels per pass.
 *
 * With r0..r4 the five source lines starting at `source` (src_pitch apart),
 * the three destination lines starting at `dest` (dest_pitch apart) become:
 *     line 0 = r0
 *     line 1 = (r1 *  85 + r2 * 171 + 128) >> 8
 *     line 2 = (r3 * 171 + r4 *  85 + 128) >> 8
 * dest_width is the number of pixels per line; the loop steps by four and
 * always runs at least once.
 */
static
void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{

    __asm
    {
        push        ebx                             // ebx is used as the loop counter below

        mov         esi,    source                    // esi = source line 0
        mov         ecx,    src_pitch               // ecx = source pitch

        mov         edi,    dest                    // edi = destination line 0
        pxor        mm7,    mm7                     // clear out mm7 (zero for unpack/pack)

        mov         edx,    dest_pitch               // edx = destination pitch
        movq        mm5,    one_thirds              // mm5 = 1/3

        movq        mm6,    two_thirds              // mm6 = 2/3
        mov         ebx,    dest_width;

        vs_5_3_loop:

        movd        mm0,    DWORD ptr [esi]         // src[0];
        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];

        movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2];
        lea         eax,    [esi+ecx*2]             // eax = address of source line 2

        punpcklbw   mm1,    mm7
        punpcklbw   mm2,    mm7

        pmullw      mm1,    mm5                     // src[1] * 1/3
        pmullw      mm2,    mm6                     // src[2] * 2/3

        movd        mm3,    DWORD ptr [eax+ecx]     // src[3];
        movd        mm4,    DWORD ptr [eax+ecx*2]   // src[4];

        punpcklbw   mm3,    mm7
        punpcklbw   mm4,    mm7

        pmullw      mm3,    mm6                     // src[3] * 2/3
        pmullw      mm4,    mm5                     // src[4] * 1/3


        movd        DWORD PTR [edi], mm0            // write des[0] (unmodified src[0])
        paddw       mm1,    mm2                     // src[1] * 1/3 + src[2] * 2/3

        paddw       mm1,    round_values            // + 128
        psrlw       mm1,    8

        packuswb    mm1,    mm7                     // des[1]
        paddw       mm3,    mm4                     // src[3] * 2/3 + src[4] * 1/3

        paddw       mm3,    round_values            // + 128
        movd        DWORD PTR [edi+edx], mm1        // write des[1]

        psrlw       mm3,    8
        packuswb    mm3,    mm7                     // des[2]

        movd        DWORD PTR [edi+edx*2], mm3      // write des[2]


        add         edi,    4                       // next four pixels
        add         esi,    4

        sub         ebx,    4                       // flags from this sub drive the jg
        jg          vs_5_3_loop

        pop         ebx
    }
}
1587
1588
1589
1590
1591/****************************************************************************
1592 *
1593 *  ROUTINE       : horizontal_line_2_1_scale
1594 *
1595 *  INPUTS        : const unsigned char *source :
1596 *                  unsigned int source_width    :
1597 *                  unsigned char *dest         :
1598 *                  unsigned int dest_width      :
1599 *
1600 *  OUTPUTS       : None.
1601 *
1602 *  RETURNS       : void
1603 *
1604 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
1605 *
1606 *  SPECIAL NOTES : None.
1607 *
1608 ****************************************************************************/
1609static
1610void horizontal_line_2_1_scale_mmx
1611(
1612    const unsigned char *source,
1613    unsigned int source_width,
1614    unsigned char *dest,
1615    unsigned int dest_width
1616)
1617{
1618    (void) dest_width;
1619    (void) source_width;
1620    __asm
1621    {
1622        mov         esi,    source
1623        mov         edi,    dest
1624
1625        pxor        mm7,    mm7
1626        mov         ecx,    dest_width
1627
1628        xor         edx,    edx
1629        hs_2_1_loop:
1630
1631        movq        mm0,    [esi+edx*2]
1632        psllw       mm0,    8
1633
1634        psrlw       mm0,    8
1635        packuswb    mm0,    mm7
1636
1637        movd        DWORD Ptr [edi+edx], mm0;
1638        add         edx,    4
1639
1640        cmp         edx,    ecx
1641        jl          hs_2_1_loop
1642
1643    }
1644}
1645
1646
1647
/*
 * 2 to 1 vertical down-scaling without filtering: dest_width bytes are
 * copied from source to dest. Neither pitch is consulted.
 */
static
void vertical_band_2_1_scale_mmx
(
    unsigned char *source,
    unsigned int src_pitch,
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    /* Only a single line is touched, so the pitches are not needed. */
    (void) src_pitch;
    (void) dest_pitch;

    vpx_memcpy(dest, source, dest_width);
}
1655
1656
/* 3/16 and 10/16 expressed as fractions of 256 (48 and 160), replicated
 * across four word lanes; used as the outer and centre filter weights by
 * vertical_band_2_1_scale_i_mmx (48 + 160 + 48 = 256). */
__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
1659
1660static
1661void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1662{
1663
1664    (void) dest_pitch;
1665    __asm
1666    {
1667        mov         esi,        source
1668        mov         edi,        dest
1669
1670        mov         eax,        src_pitch
1671        mov         edx,        dest_width
1672
1673        pxor        mm7,        mm7
1674        sub         esi,        eax             //back one line
1675
1676
1677        lea         ecx,        [esi+edx];
1678        movq        mm6,        round_values;
1679
1680        movq        mm5,        three_sixteenths;
1681        movq        mm4,        ten_sixteenths;
1682
1683        vs_2_1_i_loop:
1684        movd        mm0,        [esi]           //
1685        movd        mm1,        [esi+eax]       //
1686
1687        movd        mm2,        [esi+eax*2]     //
1688        punpcklbw   mm0,        mm7
1689
1690        pmullw      mm0,        mm5
1691        punpcklbw   mm1,        mm7
1692
1693        pmullw      mm1,        mm4
1694        punpcklbw   mm2,        mm7
1695
1696        pmullw      mm2,        mm5
1697        paddw       mm0,        round_values
1698
1699        paddw       mm1,        mm2
1700        paddw       mm0,        mm1
1701
1702        psrlw       mm0,        8
1703        packuswb    mm0,        mm7
1704
1705        movd        DWORD PTR [edi],        mm0
1706        add         esi,        4
1707
1708        add         edi,        4;
1709        cmp         esi,        ecx
1710        jl          vs_2_1_i_loop
1711
1712    }
1713}
1714
1715
1716
1717void
1718register_mmxscalers(void)
1719{
1720    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
1721    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
1722    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
1723    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
1724    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
1725    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
1726    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
1727    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
1728    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
1729
1730    vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
1731    vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
1732    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
1733    vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
1734    vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
1735    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
1736
1737
1738
1739    vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
1740    vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
1741    vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
1742    vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
1743    vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
1744    vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
1745    vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
1746
1747
1748
1749
1750}
1751