/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
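
// As an illustration only, the tables above implement the fixed-point
// sums below (this scalar sketch is not used by the assembly paths;
// pmaddubsw/phaddw plus a shift compute the same values 16 pixels at a
// time). Bytes are in little-endian B,G,R,A memory order, and ">>" on
// the signed sums is an arithmetic shift, matching psraw.
static __inline uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);    // kARGBToY + kAddY16
}
static __inline uint8 ScalarARGBToU(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);  // kARGBToU + kAddUV128
}
static __inline uint8 ScalarARGBToV(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128); // kARGBToV + kAddUV128
}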

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting RGBA to ARGB.
static const uvec8 kShuffleMaskRGBAToARGB = {
  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
};

// Shuffle table for converting ARGB to RGBA.
static const uvec8 kShuffleMaskARGBToRGBA = {
  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
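
// Each shuffle table above is a pshufb control mask: output byte i is
// taken from input byte (mask[i] & 15), and an index with the high bit
// set (128u) zeroes that output byte. A scalar sketch of this semantic,
// for illustration only (not used by the assembly):
static __inline void ScalarShuffle16(const uint8* src, const uint8* mask,
                                     uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}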

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]        // src_y
    mov        edx, [esp + 8]        // dst_argb
    mov        ecx, [esp + 12]       // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
    pslld      xmm5, 24

    align      16
 convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
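
// Scalar equivalent of the row above, as a sketch only: each gray byte
// is replicated into B, G and R, and alpha is forced to 0xff.
static __inline uint32 ScalarI400ToARGB(uint8 y) {
  return 0xff000000u | ((uint32)y * 0x00010101u);
}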

__declspec(naked) __declspec(align(16))
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_bgra
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskBGRAToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_abgr
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskABGRToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgba
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskRGBAToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgba
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskARGBToRGBA
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgb24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRGB24ToARGB

    align      16
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRAWToARGB

    align      16
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3    // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    pand      xmm0, xmm4    // G in middle 6 bits
    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
    por       xmm0, xmm7    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
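
// The pmulhuw sequences above are one way to replicate the top bits of
// a short color field into a full byte. A scalar sketch of the same
// math, for illustration only (these helpers are not used by the
// assembly):
static __inline uint8 Expand5To8(int v5) {  // v5 in [0, 31]
  // Same result as (v5 << 3) | (v5 >> 2): the field at the top of a
  // 16-bit lane, multiplied by 0x0108, keeping the high 16 bits.
  return (uint8)(((uint32)(v5 << 11) * 0x0108u) >> 16);
}
static __inline uint8 Expand6To8(int v6) {  // v6 in [0, 63]
  // Same result as (v6 << 2) | (v6 >> 4).
  return (uint8)(((uint32)(v6 << 10) * 0x0104u) >> 16);
}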

// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_argb1555
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1       // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // G in middle 5 bits
    psraw     xmm2, 8       // A
    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]   // src_argb4444
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // mask low nibbles
    pand      xmm2, xmm5    // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRGB24

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
    movdqa    xmm1, [eax + 16]
    movdqa    xmm2, [eax + 32]
    movdqa    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqa    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqa    [edx + 16], xmm1   // store 1
    movdqa    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRAW

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
    movdqa    xmm1, [eax + 16]
    movdqa    xmm2, [eax + 32]
    movdqa    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqa    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqa    [edx + 16], xmm1   // store 1
    movdqa    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
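
// Scalar form of the packing above, as an illustration only: one ARGB
// pixel (B, G, R bytes) becomes one 16-bit 565 value.
static __inline uint16 ScalarARGBToRGB565(uint8 b, uint8 g, uint8 r) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}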

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4       // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4       // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    pslld     xmm7, 15

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    movdqa    xmm3, xmm0    // R
    psrad     xmm0, 16      // A
    psrld     xmm1, 3       // B
    psrld     xmm2, 6       // G
    psrld     xmm3, 9       // R
    pand      xmm0, xmm7    // A
    pand      xmm1, xmm4    // B
    pand      xmm2, xmm5    // G
    pand      xmm3, xmm6    // R
    por       xmm0, xmm1    // BA
    por       xmm2, xmm3    // GR
    por       xmm0, xmm2    // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    psrlw     xmm3, 8

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3    // low nibble
    pand      xmm1, xmm4    // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
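
// As an illustration only, one 2x2 block of the loop above reduces to
// this scalar math (the rounding differs slightly: pavgb rounds each
// pairwise average, while this sketch rounds the 4-pixel sum once):
static __inline void ScalarARGBToUV(const uint8* row0, const uint8* row1,
                                    uint8* u, uint8* v) {
  // row0/row1 point at two vertically adjacent pairs of BGRA pixels.
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}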

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I422TOARGBROW_SSSE3

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#define UB 127 /* 2.018 * 64 = 129, saturated to the int8 maximum of 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG
};

static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
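
// The constants above implement a 6-bit fixed-point YUV-to-RGB
// conversion. A scalar sketch of the same math, for illustration only
// (the YUVTORGB macro below is the real path; the BB/BG/BR biases fold
// the -128 chroma offset into the pmaddubsw sums):
static __inline uint8 Clamp255(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void ScalarYUVToRGB(uint8 y, uint8 u, uint8 v,
                                    uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;  // 1.164 in 6-bit fixed point
  *b = Clamp255((y1 + UB * (u - 128) + VB * (v - 128)) >> 6);
  *g = Clamp255((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
  *r = Clamp255((y1 + UR * (u - 128) + VR * (v - 128)) >> 6);
}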

// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

// Read 8 UV from 444.
#define READYUV444 __asm {                                                     \
    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  }

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4                                                \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }

// Convert 8 pixels: 8 VU and 8 Y.
#define YVUTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4                                                \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }

// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* argb_buf,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* argb_buf,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels, dest aligned 16.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1702// Similar to I420 but duplicate UV once more.
1703__declspec(naked) __declspec(align(16))
1704void I411ToARGBRow_SSSE3(const uint8* y_buf,
1705                         const uint8* u_buf,
1706                         const uint8* v_buf,
1707                         uint8* argb_buf,
1708                         int width) {
1709  __asm {
1710    push       esi
1711    push       edi
1712    mov        eax, [esp + 8 + 4]   // Y
1713    mov        esi, [esp + 8 + 8]   // U
1714    mov        edi, [esp + 8 + 12]  // V
1715    mov        edx, [esp + 8 + 16]  // argb
1716    mov        ecx, [esp + 8 + 20]  // width
1717    sub        edi, esi
1718    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1719    pxor       xmm4, xmm4
1720
1721    align      16
1722 convertloop:
1723    READYUV411
1724    YUVTORGB
1725
1726    // Step 3: Weave into ARGB
1727    punpcklbw  xmm0, xmm1           // BG
1728    punpcklbw  xmm2, xmm5           // RA
1729    movdqa     xmm1, xmm0
1730    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1731    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1732    movdqa     [edx], xmm0
1733    movdqa     [edx + 16], xmm1
1734    lea        edx,  [edx + 32]
1735    sub        ecx, 8
1736    jg         convertloop
1737
1738    pop        edi
1739    pop        esi
1740    ret
1741  }
1742}
1743
1744// 8 pixels, dest aligned 16.
1745// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1746__declspec(naked) __declspec(align(16))
1747void NV12ToARGBRow_SSSE3(const uint8* y_buf,
1748                         const uint8* uv_buf,
1749                         uint8* argb_buf,
1750                         int width) {
1751  __asm {
1752    push       esi
1753    mov        eax, [esp + 4 + 4]   // Y
1754    mov        esi, [esp + 4 + 8]   // UV
1755    mov        edx, [esp + 4 + 12]  // argb
1756    mov        ecx, [esp + 4 + 16]  // width
1757    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1758    pxor       xmm4, xmm4
1759
1760    align      16
1761 convertloop:
1762    READNV12
1763    YUVTORGB
1764
1765    // Step 3: Weave into ARGB
1766    punpcklbw  xmm0, xmm1           // BG
1767    punpcklbw  xmm2, xmm5           // RA
1768    movdqa     xmm1, xmm0
1769    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1770    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1771    movdqa     [edx], xmm0
1772    movdqa     [edx + 16], xmm1
1773    lea        edx,  [edx + 32]
1774    sub        ecx, 8
1775    jg         convertloop
1776
1777    pop        esi
1778    ret
1779  }
1780}
1781
1782// 8 pixels, dest aligned 16.
1783// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1784__declspec(naked) __declspec(align(16))
1785void NV21ToARGBRow_SSSE3(const uint8* y_buf,
1786                         const uint8* uv_buf,
1787                         uint8* argb_buf,
1788                         int width) {
1789  __asm {
1790    push       esi
1791    mov        eax, [esp + 4 + 4]   // Y
1792    mov        esi, [esp + 4 + 8]   // VU
1793    mov        edx, [esp + 4 + 12]  // argb
1794    mov        ecx, [esp + 4 + 16]  // width
1795    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1796    pxor       xmm4, xmm4
1797
1798    align      16
1799 convertloop:
1800    READNV12
1801    YVUTORGB
1802
1803    // Step 3: Weave into ARGB
1804    punpcklbw  xmm0, xmm1           // BG
1805    punpcklbw  xmm2, xmm5           // RA
1806    movdqa     xmm1, xmm0
1807    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1808    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1809    movdqa     [edx], xmm0
1810    movdqa     [edx + 16], xmm1
1811    lea        edx,  [edx + 32]
1812    sub        ecx, 8
1813    jg         convertloop
1814
1815    pop        esi
1816    ret
1817  }
1818}
1819
1820// 8 pixels, unaligned.
1821// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
1822__declspec(naked) __declspec(align(16))
1823void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1824                                   const uint8* u_buf,
1825                                   const uint8* v_buf,
1826                                   uint8* argb_buf,
1827                                   int width) {
1828  __asm {
1829    push       esi
1830    push       edi
1831    mov        eax, [esp + 8 + 4]   // Y
1832    mov        esi, [esp + 8 + 8]   // U
1833    mov        edi, [esp + 8 + 12]  // V
1834    mov        edx, [esp + 8 + 16]  // argb
1835    mov        ecx, [esp + 8 + 20]  // width
1836    sub        edi, esi
1837    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1838    pxor       xmm4, xmm4
1839
1840    align      16
1841 convertloop:
1842    READYUV444
1843    YUVTORGB
1844
1845    // Step 3: Weave into ARGB
1846    punpcklbw  xmm0, xmm1           // BG
1847    punpcklbw  xmm2, xmm5           // RA
1848    movdqa     xmm1, xmm0
1849    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1850    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1851    movdqu     [edx], xmm0
1852    movdqu     [edx + 16], xmm1
1853    lea        edx,  [edx + 32]
1854    sub        ecx, 8
1855    jg         convertloop
1856
1857    pop        edi
1858    pop        esi
1859    ret
1860  }
1861}
1862
1863// 8 pixels, unaligned.
1864// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1865__declspec(naked) __declspec(align(16))
1866void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1867                                   const uint8* u_buf,
1868                                   const uint8* v_buf,
1869                                   uint8* argb_buf,
1870                                   int width) {
1871  __asm {
1872    push       esi
1873    push       edi
1874    mov        eax, [esp + 8 + 4]   // Y
1875    mov        esi, [esp + 8 + 8]   // U
1876    mov        edi, [esp + 8 + 12]  // V
1877    mov        edx, [esp + 8 + 16]  // argb
1878    mov        ecx, [esp + 8 + 20]  // width
1879    sub        edi, esi
1880    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1881    pxor       xmm4, xmm4
1882
1883    align      16
1884 convertloop:
1885    READYUV422
1886    YUVTORGB
1887
1888    // Step 3: Weave into ARGB
1889    punpcklbw  xmm0, xmm1           // BG
1890    punpcklbw  xmm2, xmm5           // RA
1891    movdqa     xmm1, xmm0
1892    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1893    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1894    movdqu     [edx], xmm0
1895    movdqu     [edx + 16], xmm1
1896    lea        edx,  [edx + 32]
1897    sub        ecx, 8
1898    jg         convertloop
1899
1900    pop        edi
1901    pop        esi
1902    ret
1903  }
1904}
1905
1906// 8 pixels, unaligned.
1907// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1908// Similar to I420 but duplicate UV once more.
1909__declspec(naked) __declspec(align(16))
1910void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1911                                   const uint8* u_buf,
1912                                   const uint8* v_buf,
1913                                   uint8* argb_buf,
1914                                   int width) {
1915  __asm {
1916    push       esi
1917    push       edi
1918    mov        eax, [esp + 8 + 4]   // Y
1919    mov        esi, [esp + 8 + 8]   // U
1920    mov        edi, [esp + 8 + 12]  // V
1921    mov        edx, [esp + 8 + 16]  // argb
1922    mov        ecx, [esp + 8 + 20]  // width
1923    sub        edi, esi
1924    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1925    pxor       xmm4, xmm4
1926
1927    align      16
1928 convertloop:
1929    READYUV411
1930    YUVTORGB
1931
1932    // Step 3: Weave into ARGB
1933    punpcklbw  xmm0, xmm1           // BG
1934    punpcklbw  xmm2, xmm5           // RA
1935    movdqa     xmm1, xmm0
1936    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1937    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1938    movdqu     [edx], xmm0
1939    movdqu     [edx + 16], xmm1
1940    lea        edx,  [edx + 32]
1941    sub        ecx, 8
1942    jg         convertloop
1943
1944    pop        edi
1945    pop        esi
1946    ret
1947  }
1948}
1949
1950
// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* argb_buf,
                                   int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}

// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* argb_buf,
                                   int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // VU
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* bgra_buf,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // bgra
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm0           // GB
    punpcklbw  xmm5, xmm2           // AR
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
    movdqa     [edx], xmm5
    movdqa     [edx + 16], xmm0
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* bgra_buf,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // bgra
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm0           // GB
    punpcklbw  xmm5, xmm2           // AR
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
    movdqu     [edx], xmm5
    movdqu     [edx + 16], xmm0
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* abgr_buf,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // abgr
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ABGR
    punpcklbw  xmm2, xmm1           // RG
    punpcklbw  xmm0, xmm5           // BA
    movdqa     xmm1, xmm2
    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
    movdqa     [edx], xmm2
    movdqa     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* abgr_buf,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // abgr
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ABGR
    punpcklbw  xmm2, xmm1           // RG
    punpcklbw  xmm0, xmm5           // BA
    movdqa     xmm1, xmm2
    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgba_buf,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgba
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm2           // GR
    punpcklbw  xmm5, xmm0           // AB
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
    movdqa     [edx], xmm5
    movdqa     [edx + 16], xmm0
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* rgba_buf,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgba
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      16
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm2           // GR
    punpcklbw  xmm5, xmm0           // AB
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
    movdqu     [edx], xmm5
    movdqu     [edx + 16], xmm0
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#endif  // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_YTOARGBROW_SSE2
__declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  __asm {
    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
    pslld      xmm4, 24
    mov        eax, 0x10001000
    movd       xmm3, eax
    pshufd     xmm3, xmm3, 0
    mov        eax, 0x012a012a
    movd       xmm2, eax
    pshufd     xmm2, xmm2, 0
    mov        eax, [esp + 4]       // Y
    mov        edx, [esp + 8]       // rgb
    mov        ecx, [esp + 12]      // width

    align      16
 convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0           // Y.Y
    psubusw    xmm0, xmm3
    pmulhuw    xmm0, xmm2
    packuswb   xmm0, xmm0           // G

    // Step 2: Weave into ARGB
    punpcklbw  xmm0, xmm0           // GG
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
    por        xmm0, xmm4
    por        xmm1, xmm4
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
#endif  // HAS_YTOARGBROW_SSE2
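
// A scalar sketch of the SSE2 path above (illustration only; the name is
// hypothetical).  The vector code computes ((y * 257 - 0x1000) * 0x012a) >> 16,
// which approximates (y - 16) * 1.164 with unsigned saturation at zero:
static void YToARGBRow_C_Sketch(const uint8* y_buf, uint8* rgb_buf, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 yw = (uint32)y_buf[x] * 257u;     // punpcklbw xmm0, xmm0
    yw = yw > 0x1000u ? yw - 0x1000u : 0u;   // psubusw: Y - 16, saturated
    yw = (yw * 0x012au) >> 16;               // pmulhuw: * 1.164
    if (yw > 255u) yw = 255u;                // packuswb saturation
    rgb_buf[0] = rgb_buf[1] = rgb_buf[2] = (uint8)yw;  // gray BGR
    rgb_buf[3] = 255u;                       // por with mask 0xff000000
    rgb_buf += 4;
  }
}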

#ifdef HAS_MIRRORROW_SSSE3

// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm5, kShuffleMirror
    lea       eax, [eax - 16]

    align      16
 convertloop:
    movdqa    xmm0, [eax + ecx]
    pshufb    xmm0, xmm5
    sub       ecx, 16
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    jg        convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_SSE2
// The SSE2 version uses movdqu, so it can be used on unaligned buffers when
// the SSSE3 version cannot.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    lea       eax, [eax - 16]

    align      16
 convertloop:
    movdqu    xmm0, [eax + ecx]
    movdqa    xmm1, xmm0        // swap bytes
    psllw     xmm0, 8
    psrlw     xmm1, 8
    por       xmm0, xmm1
    pshuflw   xmm0, xmm0, 0x1b  // swap words
    pshufhw   xmm0, xmm0, 0x1b
    pshufd    xmm0, xmm0, 0x4e  // swap qwords
    sub       ecx, 16
    movdqu    [edx], xmm0
    lea       edx, [edx + 16]
    jg        convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSE2
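
// The SSE2 mirror reverses 16 bytes in three stages: swap adjacent bytes,
// reverse the words within each 8-byte half, then swap the two halves.
// A scalar sketch of the first two stages on one 8-byte half (illustration
// only; the name is hypothetical, and the caller would still swap the two
// halves, which is what pshufd 0x4e does):
static uint64 MirrorBytes8_Sketch(uint64 x) {
  // psllw/psrlw/por: swap adjacent bytes within each word.
  x = ((x << 8) & 0xff00ff00ff00ff00ULL) |
      ((x >> 8) & 0x00ff00ff00ff00ffULL);
  // pshuflw/pshufhw 0x1b: reverse the four words of the qword.
  x = (x << 48) |
      ((x << 16) & 0x0000ffff00000000ULL) |
      ((x >> 16) & 0x00000000ffff0000ULL) |
      (x >> 48);
  return x;
}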

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

__declspec(naked) __declspec(align(16))
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
    push      edi
    mov       eax, [esp + 4 + 4]   // src
    mov       edx, [esp + 4 + 8]   // dst_u
    mov       edi, [esp + 4 + 12]  // dst_v
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm1, kShuffleMirrorUV
    lea       eax, [eax + ecx * 2 - 16]
    sub       edi, edx

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm1
    sub       ecx, 8
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [edx + edi], xmm0
    lea       edx, [edx + 8]
    jg        convertloop

    pop       edi
    ret
  }
}
#endif  // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSSE3

// Shuffle table for reversing the order of 4 ARGB pixels (bytes within each
// pixel are kept).
static const uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

__declspec(naked) __declspec(align(16))
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm5, kARGBShuffleMirror
    lea       eax, [eax - 16]

    align      16
 convertloop:
    movdqa    xmm0, [eax + ecx * 4]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    jg        convertloop
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_SSSE3

#ifdef HAS_SPLITUV_SSE2
__declspec(naked) __declspec(align(16))
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm5   // even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm2, 8      // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqa     [edx], xmm0
    movdqa     [edx + edi], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
#endif  // HAS_SPLITUV_SSE2
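
// A scalar sketch of the de-interleave above (illustration only; the name
// is hypothetical).  The mask keeps even bytes for U and the word shift
// exposes odd bytes for V:
static void SplitUV_C_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[x * 2];      // pand with 0x00ff00ff: even bytes
    dst_v[x] = src_uv[x * 2 + 1];  // psrlw 8: odd bytes
  }
}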

#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
__declspec(naked) __declspec(align(16))
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    sub        edx, eax

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     [eax + edx], xmm0
    movdqa     [eax + edx + 16], xmm1
    lea        eax, [eax + 32]
    sub        ecx, 32
    jg         convertloop
    ret
  }
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_X86
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, esi
    mov        edx, edi
    mov        esi, [esp + 4]   // src
    mov        edi, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    shr        ecx, 2
    rep movsd
    mov        edi, edx
    mov        esi, eax
    ret
  }
}
#endif  // HAS_COPYROW_X86

#ifdef HAS_SETROW_X86
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRow8_X86(uint8* dst, uint32 v32, int count) {
  __asm {
    mov        edx, edi
    mov        edi, [esp + 4]   // dst
    mov        eax, [esp + 8]   // v32
    mov        ecx, [esp + 12]  // count
    shr        ecx, 2
    rep stosd
    mov        edi, edx
    ret
  }
}

// SetRows32 writes 'width' 32 bit words per row, repeated for 'height' rows.
__declspec(naked) __declspec(align(16))
void SetRows32_X86(uint8* dst, uint32 v32, int width,
                   int dst_stride, int height) {
  __asm {
    push       esi
    push       edi
    push       ebp
    mov        edi, [esp + 12 + 4]   // dst
    mov        eax, [esp + 12 + 8]   // v32
    mov        ebp, [esp + 12 + 12]  // width
    mov        edx, [esp + 12 + 16]  // dst_stride
    mov        esi, [esp + 12 + 20]  // height
    lea        ecx, [ebp * 4]
    sub        edx, ecx             // stride - width * 4

    align      16
  convertloop:
    mov        ecx, ebp
    rep stosd
    add        edi, edx
    sub        esi, 1
    jg         convertloop

    pop        ebp
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
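
// A scalar sketch of the UV extraction above (illustration only; the name
// is hypothetical).  pavgb averages the chroma of two YUY2 rows with
// rounding, then the even/odd split separates U from V:
static void YUY2ToUVRow_C_Sketch(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {  // YUY2 stores Y0 U0 Y1 V0 per 2 pixels.
    const uint8* row1 = src_yuy2 + stride_yuy2;
    *dst_u++ = (uint8)((src_yuy2[1] + row1[1] + 1) >> 1);  // pavgb rounds up
    *dst_v++ = (uint8)((src_yuy2[3] + row1[3] + 1) >> 1);
    src_yuy2 += 4;
  }
}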

__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      16
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToYRow_SSE2(const uint8* src_uyvy,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    pand       xmm0, xmm5   // UYVY -> UVUV
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // UYVY -> UVUV
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

    align      16
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    pand       xmm0, xmm5   // UYVY -> UVUV
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      16
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // UYVY -> UVUV
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm7, xmm7       // generate constant 1
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24

    sub        ecx, 1
    je         convertloop1     // only 1 pixel?
    jl         convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test       edx, 15          // aligned?
    je         alignloop1b
    movd       xmm3, [eax]
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    psrlw      xmm3, 8          // alpha
    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movd       xmm1, [esi]      // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        alignloop1

  alignloop1b:
    add        ecx, 1 - 4
    jl         convertloop4b

    // 4 pixel loop.
  convertloop4:
    movdqu     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqu     xmm2, [esi]      // _r_b
    psrlw      xmm3, 8          // alpha
    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqu     xmm1, [esi]      // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertloop4

  convertloop4b:
    add        ecx, 4 - 1
    jl         convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd       xmm3, [eax]      // src argb
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    psrlw      xmm3, 8          // alpha
    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movd       xmm1, [esi]      // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        convertloop1

  convertloop1b:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSE2
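
// A scalar sketch of the blend math used by both blenders (illustration
// only; the name is hypothetical).  Per channel the result is
// src + dst * (256 - src_alpha) / 256, saturated, with alpha forced opaque:
static void ARGBBlendRow_C_Sketch(const uint8* src_argb0,
                                  const uint8* src_argb1,
                                  uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 ia = 256u - src_argb0[3];  // pxor ~alpha, paddw 1: 256 - alpha
    int c;
    for (c = 0; c < 3; ++c) {
      uint32 b = src_argb0[c] + ((src_argb1[c] * ia) >> 8);  // pmullw/psrlw
      dst_argb[c] = (uint8)(b > 255u ? 255u : b);            // paddusb saturates
    }
    dst_argb[3] = 255u;  // por with mask 0xff000000
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}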

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
// Same as SSE2, but replaces:
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
//    pshuflw    xmm3, xmm3, 0F5h
// with:
//    pshufb     xmm3, kShuffleAlpha // alpha
// Blend 4 pixels at a time.

__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm7, xmm7       // generate constant 1
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24

    sub        ecx, 1
    je         convertloop1     // only 1 pixel?
    jl         convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test       edx, 15          // aligned?
    je         alignloop1b
    movd       xmm3, [eax]
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movd       xmm1, [esi]      // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        alignloop1

  alignloop1b:
    add        ecx, 1 - 4
    jl         convertloop4b

    test       eax, 15          // unaligned?
    jne        convertuloop4
    test       esi, 15          // unaligned?
    jne        convertuloop4

    // 4 pixel loop.
  convertloop4:
    movdqa     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqa     xmm2, [esi]      // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqa     xmm1, [esi]      // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertloop4
    jmp        convertloop4b

    // 4 pixel unaligned loop.
  convertuloop4:
    movdqu     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqu     xmm2, [esi]      // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqu     xmm1, [esi]      // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertuloop4

  convertloop4b:
    add        ecx, 4 - 1
    jl         convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd       xmm3, [eax]      // src argb
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movd       xmm1, [esi]      // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        convertloop1

  convertloop1b:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24
    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
    psrld      xmm5, 8

    align      16
 convertloop:
    movdqa     xmm0, [eax]      // read 4 pixels
    punpcklbw  xmm0, xmm0       // first 2
    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm0, xmm2       // rgb * a
    movdqa     xmm1, [eax]      // read 4 pixels
    punpckhbw  xmm1, xmm1       // next 2 pixels
    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm1, xmm2       // rgb * a
    movdqa     xmm2, [eax]      // alphas
    psrlw      xmm0, 8
    pand       xmm2, xmm4
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    pand       xmm0, xmm5       // keep original alphas
    por        xmm0, xmm2
    sub        ecx, 4
    movdqa     [eax + edx], xmm0
    lea        eax, [eax + 16]
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBATTENUATE_SSE2
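
// A scalar sketch of the attenuate math (illustration only; the name is
// hypothetical).  punpcklbw duplicates each channel into a word (v * 257),
// the shuffles broadcast alpha as a word (a * 257), and pmulhuw plus the
// psrlw 8 keep bits 31..24 of the product, approximating v * a / 255:
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[3];
    int c;
    for (c = 0; c < 3; ++c) {
      // ((v * 257) * (a * 257)) >> 24 == pmulhuw followed by psrlw 8.
      dst_argb[c] = (uint8)((src_argb[c] * 257u * (a * 257u)) >> 24);
    }
    dst_argb[3] = (uint8)a;  // original alpha is kept
    src_argb += 4;
    dst_argb += 4;
  }
}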

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
3336    mov        eax, [esp + 4]   // src_argb0
3337    mov        edx, [esp + 8]   // dst_argb
3338    mov        ecx, [esp + 12]  // width
3339    sub        edx, eax
3340    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
3341    pslld      xmm3, 24
3342    movdqa     xmm4, kShuffleAlpha0
3343    movdqa     xmm5, kShuffleAlpha1
3344
3345    align      16
3346 convertloop:
3347    movdqa     xmm0, [eax]      // read 4 pixels
3348    pshufb     xmm0, xmm4       // isolate first 2 alphas
3349    movdqa     xmm1, [eax]      // read 4 pixels
3350    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
3351    pmulhuw    xmm0, xmm1       // rgb * a
3352    movdqa     xmm1, [eax]      // read 4 pixels
3353    pshufb     xmm1, xmm5       // isolate next 2 alphas
3354    movdqa     xmm2, [eax]      // read 4 pixels
3355    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
3356    pmulhuw    xmm1, xmm2       // rgb * a
3357    movdqa     xmm2, [eax]      // mask original alpha
3358    pand       xmm2, xmm3
3359    psrlw      xmm0, 8
3360    psrlw      xmm1, 8
3361    packuswb   xmm0, xmm1
3362    por        xmm0, xmm2       // copy original alpha
3363    sub        ecx, 4
3364    movdqa     [eax + edx], xmm0
3365    lea        eax, [eax + 16]
3366    jg         convertloop
3367
3368    ret
3369  }
3370}
3371#endif  // HAS_ARGBATTENUATEROW_SSSE3
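
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of what the two attenuate rows above compute. Duplicating
// a byte into a word (punpcklbw reg,reg, or the alpha shuffles) turns v
// into v * 257; pmulhuw followed by psrlw 8 then yields
// (v * 257 * a * 257) >> 24, a close approximation of v * a / 255.
static void ARGBAttenuateRow_ReferenceC(const uint8* src_argb,
                                        uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    dst_argb[0] = static_cast<uint8>((src_argb[0] * 257u * (a * 257u)) >> 24);
    dst_argb[1] = static_cast<uint8>((src_argb[1] * 257u * (a * 257u)) >> 24);
    dst_argb[2] = static_cast<uint8>((src_argb[2] * 257u * (a * 257u)) >> 24);
    dst_argb[3] = static_cast<uint8>(a);  // original alpha preserved
    src_argb += 4;
    dst_argb += 4;
  }
}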

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        ecx, [esp + 8 + 12]  // width
    sub        edx, eax
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24

    align      16
 convertloop:
    movdqa     xmm0, [eax]      // read 4 pixels
    movzx      esi, byte ptr [eax + 3]  // first alpha
    movzx      edi, byte ptr [eax + 7]  // second alpha
    punpcklbw  xmm0, xmm0       // first 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw    xmm2, xmm2, 0C0h // first 4 inv_alpha words
    pshuflw    xmm3, xmm3, 0C0h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm0, xmm2       // rgb * a

    movdqa     xmm1, [eax]      // read 4 pixels
    movzx      esi, byte ptr [eax + 11]  // third alpha
    movzx      edi, byte ptr [eax + 15]  // fourth alpha
    punpckhbw  xmm1, xmm1       // next 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw    xmm2, xmm2, 0C0h // first 4 inv_alpha words
    pshuflw    xmm3, xmm3, 0C0h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2       // rgb * a

    movdqa     xmm2, [eax]      // alphas
    pand       xmm2, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm2
    sub        ecx, 4
    movdqa     [eax + edx], xmm0
    lea        eax, [eax + 16]
    jg         convertloop
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
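
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the unattenuate row above. The SSE2 code multiplies by
// a fixed-point reciprocal looked up in fixed_invtbl8 (defined elsewhere
// in the library); the net effect approximates
// dst = min(255, v * 255 / a), with alpha preserved.
static void ARGBUnattenuateRow_ReferenceC(const uint8* src_argb,
                                          uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    for (int c = 0; c < 3; ++c) {
      uint32 v = a ? (src_argb[c] * 255u) / a : src_argb[c];
      dst_argb[c] = v > 255u ? 255u : static_cast<uint8>(v);  // packuswb saturates
    }
    dst_argb[3] = static_cast<uint8>(a);
    src_argb += 4;
    dst_argb += 4;
  }
}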

#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
static const vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, kARGBToGray
    sub        edx, eax

    align      16
 convertloop:
    movdqa     xmm0, [eax]  // G
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm0, xmm1
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0   // 8 G bytes
    movdqa     xmm2, [eax]  // A
    movdqa     xmm3, [eax + 16]
    psrld      xmm2, 24
    psrld      xmm3, 24
    packuswb   xmm2, xmm3
    packuswb   xmm2, xmm2   // 8 A bytes
    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
    punpcklbw  xmm0, xmm0   // 8 GG words
    punpcklbw  xmm3, xmm2   // 8 GA words
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm3   // GGGA first 4
    punpckhwd  xmm1, xmm3   // GGGA next 4
    sub        ecx, 8
    movdqa     [eax + edx], xmm0
    movdqa     [eax + edx + 16], xmm1
    lea        eax, [eax + 32]
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBGRAYROW_SSSE3
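
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the gray conversion above: a fixed-point dot product
// with the kARGBToGray coefficients, (B * 14 + G * 76 + R * 38) >> 7,
// broadcast to B, G and R, with alpha passed through unchanged.
static void ARGBGrayRow_ReferenceC(const uint8* src_argb, uint8* dst_argb,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    uint8 y = static_cast<uint8>(
        (src_argb[0] * 14 + src_argb[1] * 76 + src_argb[2] * 38) >> 7);
    dst_argb[0] = y;
    dst_argb[1] = y;
    dst_argb[2] = y;
    dst_argb[3] = src_argb[3];  // original alpha preserved
    src_argb += 4;
    dst_argb += 4;
  }
}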

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* dst_argb */
    mov        ecx, [esp + 8]   /* width */
    movdqa     xmm2, kARGBToSepiaB
    movdqa     xmm3, kARGBToSepiaG
    movdqa     xmm4, kARGBToSepiaR

    align      16
 convertloop:
    movdqa     xmm0, [eax]  // B
    movdqa     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0   // 8 B values
    movdqa     xmm5, [eax]  // G
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5   // 8 G values
    punpcklbw  xmm0, xmm5   // 8 BG values
    movdqa     xmm5, [eax]  // R
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5   // 8 R values
    movdqa     xmm6, [eax]  // A
    movdqa     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6   // 8 A values
    punpcklbw  xmm5, xmm6   // 8 RA values
    movdqa     xmm1, xmm0   // Weave BG, RA together
    punpcklwd  xmm0, xmm5   // BGRA first 4
    punpckhwd  xmm1, xmm5   // BGRA next 4
    sub        ecx, 8
    movdqa     [eax], xmm0
    movdqa     [eax + 16], xmm1
    lea        eax, [eax + 32]
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
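
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the sepia transform above: three fixed-point dot
// products per pixel using the kARGBToSepia* coefficients, shifted down by
// 7 and saturated as packuswb does, applied in place with alpha kept.
static void ARGBSepiaRow_ReferenceC(uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int b = dst_argb[0];
    const int g = dst_argb[1];
    const int r = dst_argb[2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[0] = sb > 255 ? 255 : static_cast<uint8>(sb);
    dst_argb[1] = sg > 255 ? 255 : static_cast<uint8>(sg);
    dst_argb[2] = sr > 255 ? 255 : static_cast<uint8>(sr);
    dst_argb += 4;  // byte 3 (alpha) is left untouched
  }
}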

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  __asm {
    mov        eax, [esp + 4]   /* dst_argb */
    mov        edx, [esp + 8]   /* matrix_argb */
    mov        ecx, [esp + 12]  /* width */
    movd       xmm2, [edx]
    movd       xmm3, [edx + 4]
    movd       xmm4, [edx + 8]
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pshufd     xmm4, xmm4, 0

    align      16
 convertloop:
    movdqa     xmm0, [eax]  // B
    movdqa     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    movdqa     xmm5, [eax]  // G
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm6   // B
    phaddsw    xmm5, xmm1   // G
    psraw      xmm0, 7      // B
    psraw      xmm5, 7      // G
    packuswb   xmm0, xmm0   // 8 B values
    packuswb   xmm5, xmm5   // 8 G values
    punpcklbw  xmm0, xmm5   // 8 BG values
    movdqa     xmm5, [eax]  // R
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddsw    xmm5, xmm1
    psraw      xmm5, 7
    packuswb   xmm5, xmm5   // 8 R values
    movdqa     xmm6, [eax]  // A
    movdqa     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6   // 8 A values
    movdqa     xmm1, xmm0   // Weave BG, RA together
    punpcklbw  xmm5, xmm6   // 8 RA values
    punpcklwd  xmm0, xmm5   // BGRA first 4
    punpckhwd  xmm1, xmm5   // BGRA next 4
    sub        ecx, 8
    movdqa     [eax], xmm0
    movdqa     [eax + 16], xmm1
    lea        eax, [eax + 32]
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
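
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the color matrix row above. matrix_argb supplies three
// groups of 4 signed coefficients (B, G and R rows); each output channel
// is a dot product with the pixel's B, G, R, A bytes, shifted down by 7
// and saturated. This sketch ignores the intermediate saturation of
// pmaddubsw/phaddsw; the original alpha is kept, as in the asm.
static void ARGBColorMatrixRow_ReferenceC(uint8* dst_argb,
                                          const int8* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int b = dst_argb[0];
    const int g = dst_argb[1];
    const int r = dst_argb[2];
    const int a = dst_argb[3];
    for (int c = 0; c < 3; ++c) {
      const int8* m = matrix_argb + c * 4;
      int v = (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 7;
      dst_argb[c] = v < 0 ? 0 : (v > 255 ? 255 : static_cast<uint8>(v));
    }
    dst_argb += 4;  // byte 3 (alpha) is left untouched
  }
}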

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) __declspec(align(16))
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    push       ebp
    mov        eax, [esp + 16 + 4]   /* dst_argb */
    mov        edi, [esp + 16 + 8]   /* table_argb */
    mov        ecx, [esp + 16 + 12]  /* width */
    xor        ebx, ebx
    xor        edx, edx

    align      16
 convertloop:
    mov        ebp, dword ptr [eax]  // BGRA
    mov        esi, ebp
    and        ebp, 255
    shr        esi, 8
    and        esi, 255
    mov        bl, [edi + ebp * 4 + 0]  // B
    mov        dl, [edi + esi * 4 + 1]  // G
    mov        ebp, dword ptr [eax]  // BGRA
    mov        esi, ebp
    shr        ebp, 16
    shr        esi, 24
    and        ebp, 255
    mov        [eax], bl
    mov        [eax + 1], dl
    mov        bl, [edi + ebp * 4 + 2]  // R
    mov        dl, [edi + esi * 4 + 3]  // A
    mov        [eax + 2], bl
    mov        [eax + 3], dl
    lea        eax, [eax + 4]
    sub        ecx, 1
    jg         convertloop
    pop        ebp
    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
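
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the table lookup above: each channel indexes
// table_argb by its own value, selecting the per-channel byte from the
// 4-byte table entry, exactly as the byte loads in the loop do.
static void ARGBColorTableRow_ReferenceC(uint8* dst_argb,
                                         const uint8* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}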

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov        eax, [esp + 4]    /* dst_argb */
    movd       xmm2, [esp + 8]   /* scale */
    movd       xmm3, [esp + 12]  /* interval_size */
    movd       xmm4, [esp + 16]  /* interval_offset */
    mov        ecx, [esp + 20]   /* width */
    pshuflw    xmm2, xmm2, 040h
    pshufd     xmm2, xmm2, 044h
    pshuflw    xmm3, xmm3, 040h
    pshufd     xmm3, xmm3, 044h
    pshuflw    xmm4, xmm4, 040h
    pshufd     xmm4, xmm4, 044h
    pxor       xmm5, xmm5  // constant 0
    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
    pslld      xmm6, 24

    align      16
 convertloop:
    movdqa     xmm0, [eax]  // read 4 pixels
    punpcklbw  xmm0, xmm5   // first 2 pixels
    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
    movdqa     xmm1, [eax]  // read 4 pixels
    punpckhbw  xmm1, xmm5   // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3   // * interval_size
    movdqa     xmm7, [eax]  // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6   // mask alpha
    paddw      xmm0, xmm4   // + interval_offset
    paddw      xmm1, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm7
    sub        ecx, 4
    movdqa     [eax], xmm0
    lea        eax, [eax + 16]
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
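
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the quantizer above, per color channel:
// v = (v * scale >> 16) * interval_size + interval_offset (callers may
// pass interval_size / 2 as the offset to center each interval). Alpha is
// preserved; packuswb saturation is omitted here on the assumption that
// the parameters keep results within 0..255.
static void ARGBQuantizeRow_ReferenceC(uint8* dst_argb, int scale,
                                       int interval_size, int interval_offset,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = dst_argb[c];
      dst_argb[c] = static_cast<uint8>(
          (v * scale >> 16) * interval_size + interval_offset);
    }
    dst_argb += 4;  // byte 3 (alpha) is left untouched
  }
}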

#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at a time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is the offset from left to right of the area in the CumulativeSum
//   buffer, measured in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time; requires CumulativeSum pointers to be 16-byte
// aligned.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  __asm {
    mov        eax, topleft  // eax topleft
    mov        esi, botleft  // esi botleft
    mov        edx, width
    movd       xmm4, area
    mov        edi, dst
    mov        ecx, count
    cvtdq2ps   xmm4, xmm4
    rcpss      xmm4, xmm4  // 1.0f / area
    pshufd     xmm4, xmm4, 0
    sub        ecx, 4
    jl         l4b

    // 4 pixel loop
    align      4
  l4:
    // top left
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
    cvtdq2ps   xmm1, xmm1
    mulps      xmm0, xmm4
    mulps      xmm1, xmm4
    cvtdq2ps   xmm2, xmm2
    cvtdq2ps   xmm3, xmm3
    mulps      xmm2, xmm4
    mulps      xmm3, xmm4
    cvtps2dq   xmm0, xmm0
    cvtps2dq   xmm1, xmm1
    cvtps2dq   xmm2, xmm2
    cvtps2dq   xmm3, xmm3
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
    movdqa     xmm0, [eax]
    psubd      xmm0, [eax + edx * 4]
    lea        eax, [eax + 16]
    psubd      xmm0, [esi]
    paddd      xmm0, [esi + edx * 4]
    lea        esi, [esi + 16]
    cvtdq2ps   xmm0, xmm0
    mulps      xmm0, xmm4
    cvtps2dq   xmm0, xmm0
    packssdw   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 1
    jge        l1
  l1b:
  }
}
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
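
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the averaging above. Each output pixel is the classic
// integral-image rectangle sum, topleft - topright - botleft + botright,
// for each of the 4 int32 channels, scaled by 1 / area. The SSE2 version
// uses rcpss (an approximate reciprocal), so results can differ by a
// small rounding error; saturation by packssdw/packuswb is assumed
// unnecessary for in-range sums.
static void CumulativeSumToAverage_ReferenceC(const int32* topleft,
                                              const int32* botleft,
                                              int width, int area,
                                              uint8* dst, int count) {
  const float ooa = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      dst[c] = static_cast<uint8>(sum * ooa + 0.5f);  // round to nearest
    }
    topleft += 4;  // advance one pixel (4 ints per pixel)
    botleft += 4;
    dst += 4;
  }
}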

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov        eax, row
    mov        edx, cumsum
    mov        esi, previous_cumsum
    mov        ecx, width
    sub        esi, edx
    pxor       xmm0, xmm0
    pxor       xmm1, xmm1

    sub        ecx, 4
    jl         l4b
    test       edx, 15
    jne        l4b

    // 4 pixel loop
    align      4
  l4:
    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea        eax, [eax + 16]
    movdqa     xmm4, xmm2

    punpcklbw  xmm2, xmm1
    movdqa     xmm3, xmm2
    punpcklwd  xmm2, xmm1
    punpckhwd  xmm3, xmm1

    punpckhbw  xmm4, xmm1
    movdqa     xmm5, xmm4
    punpcklwd  xmm4, xmm1
    punpckhwd  xmm5, xmm1

    paddd      xmm0, xmm2
    movdqa     xmm2, [edx + esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
    movdqa     xmm3, [edx + esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
    movdqa     xmm4, [edx + esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
    movdqa     xmm5, [edx + esi + 48]
    paddd      xmm5, xmm0

    movdqa     [edx], xmm2
    movdqa     [edx + 16], xmm3
    movdqa     [edx + 32], xmm4
    movdqa     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea        eax, [eax + 4]
    punpcklbw  xmm2, xmm1
    punpcklwd  xmm2, xmm1
    paddd      xmm0, xmm2
    movdqu     xmm2, [edx + esi]
    paddd      xmm2, xmm0
    movdqu     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 1
    jge        l1

  l1b:
  }
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
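
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the row cumulative sum above: keep a running sum of
// each of the 4 channels across the row and add the entry from the row
// above (previous_cumsum), so each output sums everything above and to
// the left.
static void ComputeCumulativeSumRow_ReferenceC(const uint8* row,
                                               int32* cumsum,
                                               const int32* previous_cumsum,
                                               int width) {
  int32 sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];  // running sum across this row
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}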

#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    movd       xmm2, [esp + 16]  // value
    sub        edx, eax
    punpcklbw  xmm2, xmm2
    punpcklqdq xmm2, xmm2

    align      16
 convertloop:
    movdqa     xmm0, [eax]      // read 4 pixels
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0       // first 2
    punpckhbw  xmm1, xmm1       // next 2
    pmulhuw    xmm0, xmm2       // argb * value
    pmulhuw    xmm1, xmm2       // argb * value
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqa     [eax + edx], xmm0
    lea        eax, [eax + 16]
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBSHADE_SSE2
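
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the shade row above. Duplicating bytes into words
// (punpcklbw reg,reg) turns each channel v and each shade byte s into
// v * 257 and s * 257; pmulhuw then psrlw 8 yields
// (v * 257 * s * 257) >> 24, close to v * s / 255. On little-endian x86
// the low byte of value shades B, then G, R and A in turn.
static void ARGBShadeRow_ReferenceC(const uint8* src_argb, uint8* dst_argb,
                                    int width, uint32 value) {
  const uint8* shade = reinterpret_cast<const uint8*>(&value);
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst_argb[c] = static_cast<uint8>(
          (src_argb[c] * 257u * (shade[c] * 257u)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}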

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from a source image, stepping along an affine slope
// (u, v advanced by du, dv per pixel), to a row of the destination.
__declspec(naked) __declspec(align(16))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 12]  // src_argb
    mov        esi, [esp + 16]  // stride
    mov        edx, [esp + 20]  // dst_argb
    mov        ecx, [esp + 24]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]  // uv
    movq       xmm7, qword ptr [ecx + 8]  // dudv
    mov        ecx, [esp + 28]  // width
    shl        esi, 16          // 4, stride
    add        esi, 4
    movd       xmm5, esi
    sub        ecx, 4
    jl         l4b

    // setup for 4 pixel loop
    pshufd     xmm7, xmm7, 0x44  // dup dudv
    pshufd     xmm5, xmm5, 0  // dup 4, stride
    movdqa     xmm0, xmm2    // x0, y0, x1, y1
    addps      xmm0, xmm7
    movlhps    xmm2, xmm0
    movdqa     xmm4, xmm7
    addps      xmm4, xmm4    // dudv *= 2
    movdqa     xmm3, xmm2    // x2, y2, x3, y3
    addps      xmm3, xmm4
    addps      xmm4, xmm4    // dudv *= 4

    // 4 pixel loop
    align      4
  l4:
    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
    packssdw   xmm0, xmm1    // x, y as 8 shorts
    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       xmm1, [eax + esi]  // read pixel 0
    movd       xmm6, [eax + edi]  // read pixel 1
    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
    addps      xmm2, xmm4    // x, y += dx, dy first 2
    movq       qword ptr [edx], xmm1
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    movd       xmm6, [eax + esi]  // read pixel 2
    movd       xmm0, [eax + edi]  // read pixel 3
    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
    addps      xmm3, xmm4    // x, y += dx, dy next 2
    sub        ecx, 4
    movq       qword ptr [edx + 8], xmm6
    lea        edx, [edx + 16]
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
    cvttps2dq  xmm0, xmm2    // x, y float to int
    packssdw   xmm0, xmm0    // x, y as shorts
    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
    addps      xmm2, xmm7    // x, y += dx, dy
    movd       esi, xmm0
    movd       xmm0, [eax + esi]  // copy a pixel
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        l1
  l1b:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBAFFINEROW_SSE2
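
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the affine row above: start at (u, v), step by
// (du, dv) per destination pixel, truncate to integers as cvttps2dq does,
// and fetch the source pixel at byte offset x * 4 + y * stride.
static void ARGBAffineRow_ReferenceC(const uint8* src_argb,
                                     int src_argb_stride, uint8* dst_argb,
                                     const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  const float du = uv_dudv[2];
  const float dv = uv_dudv[3];
  for (int i = 0; i < width; ++i) {
    const int x = static_cast<int>(u);  // truncation, matching cvttps2dq
    const int y = static_cast<int>(v);
    const uint8* src = src_argb + y * src_argb_stride + x * 4;
    dst_argb[0] = src[0];
    dst_argb[1] = src[1];
    dst_argb[2] = src[2];
    dst_argb[3] = src[3];
    dst_argb += 4;
    u += du;
    v += dv;
  }
}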

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
__declspec(naked) __declspec(align(16))
void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    shr        eax, 1
    cmp        eax, 0
    je         xloop1
    cmp        eax, 64
    je         xloop2
    movd       xmm0, eax  // high fraction 0..127
    neg        eax
    add        eax, 128
    movd       xmm5, eax  // low fraction 128..1
    punpcklbw  xmm5, xmm0
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0

    align      16
  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm5
    pmaddubsw  xmm1, xmm5
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop

    pop        edi
    pop        esi
    ret

    align      16
  xloop1:
    movdqa     xmm0, [esi]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop1

    pop        edi
    pop        esi
    ret

    align      16
  xloop2:
    movdqa     xmm0, [esi]
    pavgb      xmm0, [esi + edx]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop2

    pop        edi
    pop        esi
    ret
  }
}
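
// Illustrative scalar sketch (not part of the library; the name is
// hypothetical) of the bilinear row filter above. The fraction is halved
// to 0..127 so the pmaddubsw weights fit in signed bytes; each output
// byte is (row0 * (128 - f) + row1 * f) >> 7. f == 0 reduces to a plain
// copy, and f == 64 is close to the pavgb fast path (pavgb rounds up,
// this expression truncates).
static void ARGBInterpolateRow_ReferenceC(uint8* dst_ptr,
                                          const uint8* src_ptr,
                                          ptrdiff_t src_stride, int dst_width,
                                          int source_y_fraction) {
  const int y1_fraction = source_y_fraction >> 1;  // 0..127, as the asm does
  const int y0_fraction = 128 - y1_fraction;
  const uint8* src0 = src_ptr;
  const uint8* src1 = src_ptr + src_stride;
  for (int i = 0; i < dst_width * 4; ++i) {
    dst_ptr[i] = static_cast<uint8>(
        (src0[i] * y0_fraction + src1[i] * y1_fraction) >> 7);
  }
}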

#endif  // _M_IX86

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif