// row_win.cc revision 7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa
/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11#include "row.h"
12
13extern "C" {
14
15#ifdef HAS_ARGBTOYROW_SSSE3
16#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
17
18// Constant multiplication table for converting ARGB to I400.
19extern "C" TALIGN16(const int8, kARGBToY[16]) = {
20  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
21};
22
23extern "C" TALIGN16(const int8, kARGBToU[16]) = {
24  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
25};
26
27extern "C" TALIGN16(const int8, kARGBToV[16]) = {
28  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
29};
30
31// Constants for BGRA
32extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
33  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
34};
35
36extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
37  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
38};
39
40extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
41  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
42};
43
44// Constants for ABGR
45extern "C" TALIGN16(const int8, kABGRToY[16]) = {
46  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
47};
48
49extern "C" TALIGN16(const int8, kABGRToU[16]) = {
50  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
51};
52
53extern "C" TALIGN16(const int8, kABGRToV[16]) = {
54  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
55};
56
57extern "C" TALIGN16(const uint8, kAddY16[16]) = {
58  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
59  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
60};
61
62extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
63  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
64  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
65};
66
67// Shuffle table for converting BG24 to ARGB.
68extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
69  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
70};
71
72// Shuffle table for converting RAW to ARGB.
73extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
74  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
75};
76
77// Convert 16 ARGB pixels (64 bytes) to 16 Y values
78__declspec(naked)
79void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
80__asm {
81    mov        eax, [esp + 4]   /* src_argb */
82    mov        edx, [esp + 8]   /* dst_y */
83    mov        ecx, [esp + 12]  /* pix */
84    movdqa     xmm7, _kARGBToY
85    movdqa     xmm6, _kAddY16
86
87 convertloop :
88    movdqa     xmm0, [eax]
89    movdqa     xmm1, [eax + 16]
90    movdqa     xmm2, [eax + 32]
91    movdqa     xmm3, [eax + 48]
92    pmaddubsw  xmm0, xmm7
93    pmaddubsw  xmm1, xmm7
94    pmaddubsw  xmm2, xmm7
95    pmaddubsw  xmm3, xmm7
96    lea        eax, [eax + 64]
97    phaddw     xmm0, xmm1
98    phaddw     xmm2, xmm3
99    psrlw      xmm0, 7
100    psrlw      xmm2, 7
101    packuswb   xmm0, xmm2
102    paddb      xmm0, xmm6
103    movdqa     [edx], xmm0
104    lea        edx, [edx + 16]
105    sub        ecx, 16
106    ja         convertloop
107    ret
108  }
109}
110
111__declspec(naked)
112void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
113__asm {
114    mov        eax, [esp + 4]   /* src_argb */
115    mov        edx, [esp + 8]   /* dst_y */
116    mov        ecx, [esp + 12]  /* pix */
117    movdqa     xmm7, _kBGRAToY
118    movdqa     xmm6, _kAddY16
119
120 convertloop :
121    movdqa     xmm0, [eax]
122    movdqa     xmm1, [eax + 16]
123    movdqa     xmm2, [eax + 32]
124    movdqa     xmm3, [eax + 48]
125    pmaddubsw  xmm0, xmm7
126    pmaddubsw  xmm1, xmm7
127    pmaddubsw  xmm2, xmm7
128    pmaddubsw  xmm3, xmm7
129    lea        eax, [eax + 64]
130    phaddw     xmm0, xmm1
131    phaddw     xmm2, xmm3
132    psrlw      xmm0, 7
133    psrlw      xmm2, 7
134    packuswb   xmm0, xmm2
135    paddb      xmm0, xmm6
136    movdqa     [edx], xmm0
137    lea        edx, [edx + 16]
138    sub        ecx, 16
139    ja         convertloop
140    ret
141  }
142}
143
144__declspec(naked)
145void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
146__asm {
147    mov        eax, [esp + 4]   /* src_argb */
148    mov        edx, [esp + 8]   /* dst_y */
149    mov        ecx, [esp + 12]  /* pix */
150    movdqa     xmm7, _kABGRToY
151    movdqa     xmm6, _kAddY16
152
153 convertloop :
154    movdqa     xmm0, [eax]
155    movdqa     xmm1, [eax + 16]
156    movdqa     xmm2, [eax + 32]
157    movdqa     xmm3, [eax + 48]
158    pmaddubsw  xmm0, xmm7
159    pmaddubsw  xmm1, xmm7
160    pmaddubsw  xmm2, xmm7
161    pmaddubsw  xmm3, xmm7
162    lea        eax, [eax + 64]
163    phaddw     xmm0, xmm1
164    phaddw     xmm2, xmm3
165    psrlw      xmm0, 7
166    psrlw      xmm2, 7
167    packuswb   xmm0, xmm2
168    paddb      xmm0, xmm6
169    movdqa     [edx], xmm0
170    lea        edx, [edx + 16]
171    sub        ecx, 16
172    ja         convertloop
173    ret
174  }
175}
176
177__declspec(naked)
178void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
179                       uint8* dst_u, uint8* dst_v, int width) {
180__asm {
181    push       esi
182    push       edi
183    mov        eax, [esp + 8 + 4]   // src_argb
184    mov        esi, [esp + 8 + 8]   // src_stride_argb
185    mov        edx, [esp + 8 + 12]  // dst_u
186    mov        edi, [esp + 8 + 16]  // dst_v
187    mov        ecx, [esp + 8 + 20]  // pix
188    movdqa     xmm7, _kARGBToU
189    movdqa     xmm6, _kARGBToV
190    movdqa     xmm5, _kAddUV128
191    sub        edi, edx             // stride from u to v
192
193 convertloop :
194    /* step 1 - subsample 16x2 argb pixels to 8x1 */
195    movdqa     xmm0, [eax]
196    movdqa     xmm1, [eax + 16]
197    movdqa     xmm2, [eax + 32]
198    movdqa     xmm3, [eax + 48]
199    pavgb      xmm0, [eax + esi]
200    pavgb      xmm1, [eax + esi + 16]
201    pavgb      xmm2, [eax + esi + 32]
202    pavgb      xmm3, [eax + esi + 48]
203    lea        eax,  [eax + 64]
204    movdqa     xmm4, xmm0
205    shufps     xmm0, xmm1, 0x88
206    shufps     xmm4, xmm1, 0xdd
207    pavgb      xmm0, xmm4
208    movdqa     xmm4, xmm2
209    shufps     xmm2, xmm3, 0x88
210    shufps     xmm4, xmm3, 0xdd
211    pavgb      xmm2, xmm4
212
213    // step 2 - convert to U and V
214    // from here down is very similar to Y code except
215    // instead of 16 different pixels, its 8 pixels of U and 8 of V
216    movdqa     xmm1, xmm0
217    movdqa     xmm3, xmm2
218    pmaddubsw  xmm0, xmm7  // U
219    pmaddubsw  xmm2, xmm7
220    pmaddubsw  xmm1, xmm6  // V
221    pmaddubsw  xmm3, xmm6
222    phaddw     xmm0, xmm2
223    phaddw     xmm1, xmm3
224    psraw      xmm0, 8
225    psraw      xmm1, 8
226    packsswb   xmm0, xmm1
227    paddb      xmm0, xmm5            // -> unsigned
228
229    // step 3 - store 8 U and 8 V values
230    movlps     qword ptr [edx], xmm0 // U
231    movhps     qword ptr [edx + edi], xmm0 // V
232    lea        edx, [edx + 8]
233    sub        ecx, 16
234    ja         convertloop
235    pop        edi
236    pop        esi
237    ret
238  }
239}
240
241__declspec(naked)
242void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
243                       uint8* dst_u, uint8* dst_v, int width) {
244__asm {
245    push       esi
246    push       edi
247    mov        eax, [esp + 8 + 4]   // src_argb
248    mov        esi, [esp + 8 + 8]   // src_stride_argb
249    mov        edx, [esp + 8 + 12]  // dst_u
250    mov        edi, [esp + 8 + 16]  // dst_v
251    mov        ecx, [esp + 8 + 20]  // pix
252    movdqa     xmm7, _kBGRAToU
253    movdqa     xmm6, _kBGRAToV
254    movdqa     xmm5, _kAddUV128
255    sub        edi, edx             // stride from u to v
256
257 convertloop :
258    /* step 1 - subsample 16x2 argb pixels to 8x1 */
259    movdqa     xmm0, [eax]
260    movdqa     xmm1, [eax + 16]
261    movdqa     xmm2, [eax + 32]
262    movdqa     xmm3, [eax + 48]
263    pavgb      xmm0, [eax + esi]
264    pavgb      xmm1, [eax + esi + 16]
265    pavgb      xmm2, [eax + esi + 32]
266    pavgb      xmm3, [eax + esi + 48]
267    lea        eax,  [eax + 64]
268    movdqa     xmm4, xmm0
269    shufps     xmm0, xmm1, 0x88
270    shufps     xmm4, xmm1, 0xdd
271    pavgb      xmm0, xmm4
272    movdqa     xmm4, xmm2
273    shufps     xmm2, xmm3, 0x88
274    shufps     xmm4, xmm3, 0xdd
275    pavgb      xmm2, xmm4
276
277    // step 2 - convert to U and V
278    // from here down is very similar to Y code except
279    // instead of 16 different pixels, its 8 pixels of U and 8 of V
280    movdqa     xmm1, xmm0
281    movdqa     xmm3, xmm2
282    pmaddubsw  xmm0, xmm7  // U
283    pmaddubsw  xmm2, xmm7
284    pmaddubsw  xmm1, xmm6  // V
285    pmaddubsw  xmm3, xmm6
286    phaddw     xmm0, xmm2
287    phaddw     xmm1, xmm3
288    psraw      xmm0, 8
289    psraw      xmm1, 8
290    packsswb   xmm0, xmm1
291    paddb      xmm0, xmm5            // -> unsigned
292
293    // step 3 - store 8 U and 8 V values
294    movlps     qword ptr [edx], xmm0 // U
295    movhps     qword ptr [edx + edi], xmm0 // V
296    lea        edx, [edx + 8]
297    sub        ecx, 16
298    ja         convertloop
299    pop        edi
300    pop        esi
301    ret
302  }
303}
304
305__declspec(naked)
306void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
307                       uint8* dst_u, uint8* dst_v, int width) {
308__asm {
309    push       esi
310    push       edi
311    mov        eax, [esp + 8 + 4]   // src_argb
312    mov        esi, [esp + 8 + 8]   // src_stride_argb
313    mov        edx, [esp + 8 + 12]  // dst_u
314    mov        edi, [esp + 8 + 16]  // dst_v
315    mov        ecx, [esp + 8 + 20]  // pix
316    movdqa     xmm7, _kABGRToU
317    movdqa     xmm6, _kABGRToV
318    movdqa     xmm5, _kAddUV128
319    sub        edi, edx             // stride from u to v
320
321 convertloop :
322    /* step 1 - subsample 16x2 argb pixels to 8x1 */
323    movdqa     xmm0, [eax]
324    movdqa     xmm1, [eax + 16]
325    movdqa     xmm2, [eax + 32]
326    movdqa     xmm3, [eax + 48]
327    pavgb      xmm0, [eax + esi]
328    pavgb      xmm1, [eax + esi + 16]
329    pavgb      xmm2, [eax + esi + 32]
330    pavgb      xmm3, [eax + esi + 48]
331    lea        eax,  [eax + 64]
332    movdqa     xmm4, xmm0
333    shufps     xmm0, xmm1, 0x88
334    shufps     xmm4, xmm1, 0xdd
335    pavgb      xmm0, xmm4
336    movdqa     xmm4, xmm2
337    shufps     xmm2, xmm3, 0x88
338    shufps     xmm4, xmm3, 0xdd
339    pavgb      xmm2, xmm4
340
341    // step 2 - convert to U and V
342    // from here down is very similar to Y code except
343    // instead of 16 different pixels, its 8 pixels of U and 8 of V
344    movdqa     xmm1, xmm0
345    movdqa     xmm3, xmm2
346    pmaddubsw  xmm0, xmm7  // U
347    pmaddubsw  xmm2, xmm7
348    pmaddubsw  xmm1, xmm6  // V
349    pmaddubsw  xmm3, xmm6
350    phaddw     xmm0, xmm2
351    phaddw     xmm1, xmm3
352    psraw      xmm0, 8
353    psraw      xmm1, 8
354    packsswb   xmm0, xmm1
355    paddb      xmm0, xmm5            // -> unsigned
356
357    // step 3 - store 8 U and 8 V values
358    movlps     qword ptr [edx], xmm0 // U
359    movhps     qword ptr [edx + edi], xmm0 // V
360    lea        edx, [edx + 8]
361    sub        ecx, 16
362    ja         convertloop
363    pop        edi
364    pop        esi
365    ret
366  }
367}
368
369__declspec(naked)
370void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
371__asm {
372    mov       eax, [esp + 4]   // src_bg24
373    mov       edx, [esp + 8]   // dst_argb
374    mov       ecx, [esp + 12]  // pix
375    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
376    pslld     xmm7, 24
377    movdqa    xmm6, _kShuffleMaskBG24ToARGB
378
379 convertloop :
380    movdqa    xmm0, [eax]
381    movdqa    xmm1, [eax + 16]
382    movdqa    xmm3, [eax + 32]
383    lea       eax, [eax + 48]
384    movdqa    xmm2, xmm3
385    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
386    pshufb    xmm2, xmm6
387    por       xmm2, xmm7
388    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
389    pshufb    xmm0, xmm6
390    movdqa    [edx + 32], xmm2
391    por       xmm0, xmm7
392    pshufb    xmm1, xmm6
393    movdqa    [edx], xmm0
394    por       xmm1, xmm7
395    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
396    pshufb    xmm3, xmm6
397    movdqa    [edx + 16], xmm1
398    por       xmm3, xmm7
399    movdqa    [edx + 48], xmm3
400    lea       edx, [edx + 64]
401    sub       ecx, 16
402    ja        convertloop
403    ret
404  }
405}
406
407__declspec(naked)
408void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
409                        int pix) {
410__asm {
411    mov       eax, [esp + 4]   // src_raw
412    mov       edx, [esp + 8]   // dst_argb
413    mov       ecx, [esp + 12]  // pix
414    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
415    pslld     xmm7, 24
416    movdqa    xmm6, _kShuffleMaskRAWToARGB
417
418 convertloop :
419    movdqa    xmm0, [eax]
420    movdqa    xmm1, [eax + 16]
421    movdqa    xmm3, [eax + 32]
422    lea       eax, [eax + 48]
423    movdqa    xmm2, xmm3
424    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
425    pshufb    xmm2, xmm6
426    por       xmm2, xmm7
427    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
428    pshufb    xmm0, xmm6
429    movdqa    [edx + 32], xmm2
430    por       xmm0, xmm7
431    pshufb    xmm1, xmm6
432    movdqa    [edx], xmm0
433    por       xmm1, xmm7
434    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
435    pshufb    xmm3, xmm6
436    movdqa    [edx + 16], xmm1
437    por       xmm3, xmm7
438    movdqa    [edx + 48], xmm3
439    lea       edx, [edx + 64]
440    sub       ecx, 16
441    ja        convertloop
442    ret
443  }
444}
445
446__declspec(naked)
447void FastConvertYUVToRGB32Row(const uint8* y_buf,
448                              const uint8* u_buf,
449                              const uint8* v_buf,
450                              uint8* rgb_buf,
451                              int width) {
452  __asm {
453    pushad
454    mov       edx, [esp + 32 + 4]
455    mov       edi, [esp + 32 + 8]
456    mov       esi, [esp + 32 + 12]
457    mov       ebp, [esp + 32 + 16]
458    mov       ecx, [esp + 32 + 20]
459
460 convertloop :
461    movzx     eax, byte ptr [edi]
462    lea       edi, [edi + 1]
463    movzx     ebx, byte ptr [esi]
464    lea       esi, [esi + 1]
465    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
466    movzx     eax, byte ptr [edx]
467    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
468    movzx     ebx, byte ptr [edx + 1]
469    movq      mm1, [_kCoefficientsRgbY + 8 * eax]
470    lea       edx, [edx + 2]
471    movq      mm2, [_kCoefficientsRgbY + 8 * ebx]
472    paddsw    mm1, mm0
473    paddsw    mm2, mm0
474    psraw     mm1, 6
475    psraw     mm2, 6
476    packuswb  mm1, mm2
477    movntq    [ebp], mm1
478    lea       ebp, [ebp + 8]
479    sub       ecx, 2
480    ja        convertloop
481
482    popad
483    ret
484  }
485}
486
487__declspec(naked)
488void FastConvertYUVToBGRARow(const uint8* y_buf,
489                             const uint8* u_buf,
490                             const uint8* v_buf,
491                             uint8* rgb_buf,
492                             int width) {
493  __asm {
494    pushad
495    mov       edx, [esp + 32 + 4]
496    mov       edi, [esp + 32 + 8]
497    mov       esi, [esp + 32 + 12]
498    mov       ebp, [esp + 32 + 16]
499    mov       ecx, [esp + 32 + 20]
500
501 convertloop :
502    movzx     eax, byte ptr [edi]
503    lea       edi, [edi + 1]
504    movzx     ebx, byte ptr [esi]
505    lea       esi, [esi + 1]
506    movq      mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
507    movzx     eax, byte ptr [edx]
508    paddsw    mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
509    movzx     ebx, byte ptr [edx + 1]
510    movq      mm1, [_kCoefficientsBgraY + 8 * eax]
511    lea       edx, [edx + 2]
512    movq      mm2, [_kCoefficientsBgraY + 8 * ebx]
513    paddsw    mm1, mm0
514    paddsw    mm2, mm0
515    psraw     mm1, 6
516    psraw     mm2, 6
517    packuswb  mm1, mm2
518    movntq    [ebp], mm1
519    lea       ebp, [ebp + 8]
520    sub       ecx, 2
521    ja        convertloop
522
523    popad
524    ret
525  }
526}
527
528__declspec(naked)
529void FastConvertYUVToABGRRow(const uint8* y_buf,
530                             const uint8* u_buf,
531                             const uint8* v_buf,
532                             uint8* rgb_buf,
533                             int width) {
534  __asm {
535    pushad
536    mov       edx, [esp + 32 + 4]
537    mov       edi, [esp + 32 + 8]
538    mov       esi, [esp + 32 + 12]
539    mov       ebp, [esp + 32 + 16]
540    mov       ecx, [esp + 32 + 20]
541
542 convertloop :
543    movzx     eax, byte ptr [edi]
544    lea       edi, [edi + 1]
545    movzx     ebx, byte ptr [esi]
546    lea       esi, [esi + 1]
547    movq      mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
548    movzx     eax, byte ptr [edx]
549    paddsw    mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
550    movzx     ebx, byte ptr [edx + 1]
551    movq      mm1, [_kCoefficientsAbgrY + 8 * eax]
552    lea       edx, [edx + 2]
553    movq      mm2, [_kCoefficientsAbgrY + 8 * ebx]
554    paddsw    mm1, mm0
555    paddsw    mm2, mm0
556    psraw     mm1, 6
557    psraw     mm2, 6
558    packuswb  mm1, mm2
559    movntq    [ebp], mm1
560    lea       ebp, [ebp + 8]
561    sub       ecx, 2
562    ja        convertloop
563
564    popad
565    ret
566  }
567}
568
569__declspec(naked)
570void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
571                                 const uint8* u_buf,
572                                 const uint8* v_buf,
573                                 uint8* rgb_buf,
574                                 int width) {
575  __asm {
576    pushad
577    mov       edx, [esp + 32 + 4]   // Y
578    mov       edi, [esp + 32 + 8]   // U
579    mov       esi, [esp + 32 + 12]  // V
580    mov       ebp, [esp + 32 + 16]  // rgb
581    mov       ecx, [esp + 32 + 20]  // width
582
583 convertloop :
584    movzx     eax, byte ptr [edi]
585    lea       edi, [edi + 1]
586    movzx     ebx, byte ptr [esi]
587    lea       esi, [esi + 1]
588    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
589    movzx     eax, byte ptr [edx]
590    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
591    lea       edx, [edx + 1]
592    paddsw    mm0, [_kCoefficientsRgbY + 8 * eax]
593    psraw     mm0, 6
594    packuswb  mm0, mm0
595    movd      [ebp], mm0
596    lea       ebp, [ebp + 4]
597    sub       ecx, 1
598    ja        convertloop
599
600    popad
601    ret
602  }
603}
604
605__declspec(naked)
606void FastConvertYToRGB32Row(const uint8* y_buf,
607                            uint8* rgb_buf,
608                            int width) {
609  __asm {
610    push      ebx
611    mov       eax, [esp + 4 + 4]   // Y
612    mov       edx, [esp + 4 + 8]   // rgb
613    mov       ecx, [esp + 4 + 12]  // width
614
615 convertloop :
616    movzx     ebx, byte ptr [eax]
617    movq      mm0, [_kCoefficientsRgbY + 8 * ebx]
618    psraw     mm0, 6
619    movzx     ebx, byte ptr [eax + 1]
620    movq      mm1, [_kCoefficientsRgbY + 8 * ebx]
621    psraw     mm1, 6
622    packuswb  mm0, mm1
623    lea       eax, [eax + 2]
624    movq      [edx], mm0
625    lea       edx, [edx + 8]
626    sub       ecx, 2
627    ja        convertloop
628
629    pop       ebx
630    ret
631  }
632}
633
634#endif
635
636}  // extern "C"
637