1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12/****************************************************************************
13*
14*   Module Title :     scaleopt.cpp
15*
16*   Description  :     Optimized scaling functions
17*
18****************************************************************************/
19#include "pragmas.h"
20
21/****************************************************************************
22*  Module Statics
23****************************************************************************/
24__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
25
26#include "vpx_scale/vpx_scale.h"
27#include "vpx_mem/vpx_mem.h"
28
29__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
30__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
31
32
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_5_4_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width    : Width of the source line in pixels.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width      : Width of the destination line (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies a horizontal line of pixels from source to
 *                  destination, scaling down from 5 pixels to 4.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
52static
53void horizontal_line_5_4_scale_mmx
54(
55  const unsigned char *source,
56  unsigned int source_width,
57  unsigned char *dest,
58  unsigned int dest_width
59) {
60  /*
61  unsigned i;
62  unsigned int a, b, c, d, e;
63  unsigned char *des = dest;
64  const unsigned char *src = source;
65
66  (void) dest_width;
67
68  for ( i=0; i<source_width; i+=5 )
69  {
70      a = src[0];
71      b = src[1];
72      c = src[2];
73      d = src[3];
74      e = src[4];
75
76      des[0] = a;
77      des[1] = ((b*192 + c* 64 + 128)>>8);
78      des[2] = ((c*128 + d*128 + 128)>>8);
79      des[3] = ((d* 64 + e*192 + 128)>>8);
80
81      src += 5;
82      des += 4;
83  }
84  */
85  (void) dest_width;
86
87  __asm {
88
89    mov         esi,        source;
90    mov         edi,        dest;
91
92    mov         ecx,        source_width;
93    movq        mm5,        const54_1;
94
95    pxor        mm7,        mm7;
96    movq        mm6,        const54_2;
97
98    movq        mm4,        round_values;
99    lea         edx,        [esi+ecx];
100    horizontal_line_5_4_loop:
101
102    movq        mm0,        QWORD PTR  [esi];
103    00 01 02 03 04 05 06 07
104    movq        mm1,        mm0;
105    00 01 02 03 04 05 06 07
106
107    psrlq       mm0,        8;
108    01 02 03 04 05 06 07 xx
109    punpcklbw   mm1,        mm7;
110    xx 00 xx 01 xx 02 xx 03
111
112    punpcklbw   mm0,        mm7;
113    xx 01 xx 02 xx 03 xx 04
114    pmullw      mm1,        mm5
115
116    pmullw      mm0,        mm6
117    add         esi,        5
118
119    add         edi,        4
120    paddw       mm1,        mm0
121
122    paddw       mm1,        mm4
123    psrlw       mm1,        8
124
125    cmp         esi,        edx
126    packuswb    mm1,        mm7
127
128    movd        DWORD PTR [edi-4], mm1
129
130    jl          horizontal_line_5_4_loop
131
132  }
133
134}
135__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
136__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
137__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
138
139static
140void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
141
142  __asm {
143    push        ebx
144
145    mov         esi,    source                    // Get the source and destination pointer
146    mov         ecx,    src_pitch               // Get the pitch size
147
148    mov         edi,    dest                    // tow lines below
149    pxor        mm7,    mm7                     // clear out mm7
150
151    mov         edx,    dest_pitch               // Loop counter
152    mov         ebx,    dest_width
153
154    vs_5_4_loop:
155
156    movd        mm0,    DWORD ptr [esi]         // src[0];
157    movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
158
159    movd        mm2,    DWORD ptr [esi+ecx*2]
160    lea         eax,    [esi+ecx*2]             //
161
162    punpcklbw   mm1,    mm7
163    punpcklbw   mm2,    mm7
164
165    movq        mm3,    mm2
166    pmullw      mm1,    three_fourths
167
168    pmullw      mm2,    one_fourths
169    movd        mm4,    [eax+ecx]
170
171    pmullw      mm3,    two_fourths
172    punpcklbw   mm4,    mm7
173
174    movq        mm5,    mm4
175    pmullw      mm4,    two_fourths
176
177    paddw       mm1,    mm2
178    movd        mm6,    [eax+ecx*2]
179
180    pmullw      mm5,    one_fourths
181    paddw       mm1,    round_values;
182
183    paddw       mm3,    mm4
184    psrlw       mm1,    8
185
186    punpcklbw   mm6,    mm7
187    paddw       mm3,    round_values
188
189    pmullw      mm6,    three_fourths
190    psrlw       mm3,    8
191
192    packuswb    mm1,    mm7
193    packuswb    mm3,    mm7
194
195    movd        DWORD PTR [edi], mm0
196    movd        DWORD PTR [edi+edx], mm1
197
198
199    paddw       mm5,    mm6
200    movd        DWORD PTR [edi+edx*2], mm3
201
202    lea         eax,    [edi+edx*2]
203    paddw       mm5,    round_values
204
205    psrlw       mm5,    8
206    add         edi,    4
207
208    packuswb    mm5,    mm7
209    movd        DWORD PTR [eax+edx], mm5
210
211    add         esi,    4
212    sub         ebx,    4
213
214    jg         vs_5_4_loop
215
216    pop         ebx
217  }
218}
219
220
221__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
222__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
223
224
225static
226void horizontal_line_5_3_scale_mmx
227(
228  const unsigned char *source,
229  unsigned int source_width,
230  unsigned char *dest,
231  unsigned int dest_width
232) {
233
234  (void) dest_width;
235  __asm {
236
237    mov         esi,        source;
238    mov         edi,        dest;
239
240    mov         ecx,        source_width;
241    movq        mm5,        const53_1;
242
243    pxor        mm7,        mm7;
244    movq        mm6,        const53_2;
245
246    movq        mm4,        round_values;
247    lea         edx,        [esi+ecx-5];
248    horizontal_line_5_3_loop:
249
250    movq        mm0,        QWORD PTR  [esi];
251    00 01 02 03 04 05 06 07
252    movq        mm1,        mm0;
253    00 01 02 03 04 05 06 07
254
255    psllw       mm0,        8;
256    xx 00 xx 02 xx 04 xx 06
257    psrlw       mm1,        8;
258    01 xx 03 xx 05 xx 07 xx
259
260    psrlw       mm0,        8;
261    00 xx 02 xx 04 xx 06 xx
262    psllq       mm1,        16;
263    xx xx 01 xx 03 xx 05 xx
264
265    pmullw      mm0,        mm6
266
267    pmullw      mm1,        mm5
268    add         esi,        5
269
270    add         edi,        3
271    paddw       mm1,        mm0
272
273    paddw       mm1,        mm4
274    psrlw       mm1,        8
275
276    cmp         esi,        edx
277    packuswb    mm1,        mm7
278
279    movd        DWORD PTR [edi-3], mm1
280    jl          horizontal_line_5_3_loop
281
282// exit condition
283    movq        mm0,        QWORD PTR  [esi];
284    00 01 02 03 04 05 06 07
285    movq        mm1,        mm0;
286    00 01 02 03 04 05 06 07
287
288    psllw       mm0,        8;
289    xx 00 xx 02 xx 04 xx 06
290    psrlw       mm1,        8;
291    01 xx 03 xx 05 xx 07 xx
292
293    psrlw       mm0,        8;
294    00 xx 02 xx 04 xx 06 xx
295    psllq       mm1,        16;
296    xx xx 01 xx 03 xx 05 xx
297
298    pmullw      mm0,        mm6
299
300    pmullw      mm1,        mm5
301    paddw       mm1,        mm0
302
303    paddw       mm1,        mm4
304    psrlw       mm1,        8
305
306    packuswb    mm1,        mm7
307    movd        eax,        mm1
308
309    mov         edx,        eax
310    shr         edx,        16
311
312    mov         WORD PTR[edi],   ax
313    mov         BYTE PTR[edi+2], dl
314
315  }
316
317}
318
319__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
320__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
321
322static
323void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
324
325  __asm {
326    push        ebx
327
328    mov         esi,    source                    // Get the source and destination pointer
329    mov         ecx,    src_pitch               // Get the pitch size
330
331    mov         edi,    dest                    // tow lines below
332    pxor        mm7,    mm7                     // clear out mm7
333
334    mov         edx,    dest_pitch               // Loop counter
335    movq        mm5,    one_thirds
336
337    movq        mm6,    two_thirds
338    mov         ebx,    dest_width;
339
340    vs_5_3_loop:
341
342    movd        mm0,    DWORD ptr [esi]         // src[0];
343    movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
344
345    movd        mm2,    DWORD ptr [esi+ecx*2]
346    lea         eax,    [esi+ecx*2]             //
347
348    punpcklbw   mm1,    mm7
349    punpcklbw   mm2,    mm7
350
351    pmullw      mm1,    mm5
352    pmullw      mm2,    mm6
353
354    movd        mm3,    DWORD ptr [eax+ecx]
355    movd        mm4,    DWORD ptr [eax+ecx*2]
356
357    punpcklbw   mm3,    mm7
358    punpcklbw   mm4,    mm7
359
360    pmullw      mm3,    mm6
361    pmullw      mm4,    mm5
362
363
364    movd        DWORD PTR [edi], mm0
365    paddw       mm1,    mm2
366
367    paddw       mm1,    round_values
368    psrlw       mm1,    8
369
370    packuswb    mm1,    mm7
371    paddw       mm3,    mm4
372
373    paddw       mm3,    round_values
374    movd        DWORD PTR [edi+edx], mm1
375
376    psrlw       mm3,    8
377    packuswb    mm3,    mm7
378
379    movd        DWORD PTR [edi+edx*2], mm3
380
381
382    add         edi,    4
383    add         esi,    4
384
385    sub         ebx,    4
386    jg          vs_5_3_loop
387
388    pop         ebx
389  }
390}
391
392
393
394
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_2_1_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width    : Width of the source line (NOT USED).
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width      : Number of destination pixels to produce.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels:
 *                  every second source pixel is copied to the destination.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
413static
414void horizontal_line_2_1_scale_mmx
415(
416  const unsigned char *source,
417  unsigned int source_width,
418  unsigned char *dest,
419  unsigned int dest_width
420) {
421  (void) dest_width;
422  (void) source_width;
423  __asm {
424    mov         esi,    source
425    mov         edi,    dest
426
427    pxor        mm7,    mm7
428    mov         ecx,    dest_width
429
430    xor         edx,    edx
431    hs_2_1_loop:
432
433    movq        mm0,    [esi+edx*2]
434    psllw       mm0,    8
435
436    psrlw       mm0,    8
437    packuswb    mm0,    mm7
438
439    movd        DWORD Ptr [edi+edx], mm0;
440    add         edx,    4
441
442    cmp         edx,    ecx
443    jl          hs_2_1_loop
444
445  }
446}
447
448
449
450static
451void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
452  (void) dest_pitch;
453  (void) src_pitch;
454  vpx_memcpy(dest, source, dest_width);
455}
456
457
458__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
459__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
460
461static
462void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
463
464  (void) dest_pitch;
465  __asm {
466    mov         esi,        source
467    mov         edi,        dest
468
469    mov         eax,        src_pitch
470    mov         edx,        dest_width
471
472    pxor        mm7,        mm7
473    sub         esi,        eax             // back one line
474
475
476    lea         ecx,        [esi+edx];
477    movq        mm6,        round_values;
478
479    movq        mm5,        three_sixteenths;
480    movq        mm4,        ten_sixteenths;
481
482    vs_2_1_i_loop:
483    movd        mm0,        [esi]           //
484    movd        mm1,        [esi+eax]       //
485
486    movd        mm2,        [esi+eax*2]     //
487    punpcklbw   mm0,        mm7
488
489    pmullw      mm0,        mm5
490    punpcklbw   mm1,        mm7
491
492    pmullw      mm1,        mm4
493    punpcklbw   mm2,        mm7
494
495    pmullw      mm2,        mm5
496    paddw       mm0,        round_values
497
498    paddw       mm1,        mm2
499    paddw       mm0,        mm1
500
501    psrlw       mm0,        8
502    packuswb    mm0,        mm7
503
504    movd        DWORD PTR [edi],        mm0
505    add         esi,        4
506
507    add         edi,        4;
508    cmp         esi,        ecx
509    jl          vs_2_1_i_loop
510
511  }
512}
513
514
515
516void
517register_mmxscalers(void) {
518  vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
519  vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
520  vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
521  vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
522  vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
523  vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
524  vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
525}
526