1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13// This module is for Visual C 32/64 bit and clangcl 32 bit
14#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16
17#if defined(_M_X64)
18#include <emmintrin.h>
19#include <tmmintrin.h>  // For _mm_maddubs_epi16
20#endif
21
22#ifdef __cplusplus
23namespace libyuv {
24extern "C" {
25#endif
26
27// 64 bit
28#if defined(_M_X64)
29
30// Read 4 UV from 422, upsample to 8 UV.
31#define READYUV422                                                             \
32    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
33    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
34    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
35    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
36    u_buf += 4;                                                                \
37    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
38    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
39    y_buf += 8;
40
41// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
42#define READYUVA422                                                            \
43    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
44    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
45    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
46    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
47    u_buf += 4;                                                                \
48    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
49    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
50    y_buf += 8;                                                                \
51    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
52    a_buf += 8;
53
54// Convert 8 pixels: 8 UV and 8 Y.
55#define YUVTORGB(yuvconstants)                                                 \
56    xmm1 = _mm_loadu_si128(&xmm0);                                             \
57    xmm2 = _mm_loadu_si128(&xmm0);                                             \
58    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
59    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
60    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
61    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
62    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
63    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
64    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
65    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
66    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
67    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
68    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
69    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
70    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
71    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
72    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
73    xmm2 = _mm_packus_epi16(xmm2, xmm2);
74
75// Store 8 ARGB values.
76#define STOREARGB                                                              \
77    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
78    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
79    xmm1 = _mm_loadu_si128(&xmm0);                                             \
80    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
81    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
82    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
83    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
84    dst_argb += 32;
85
86
87#if defined(HAS_I422TOARGBROW_SSSE3)
88void I422ToARGBRow_SSSE3(const uint8* y_buf,
89                         const uint8* u_buf,
90                         const uint8* v_buf,
91                         uint8* dst_argb,
92                         const struct YuvConstants* yuvconstants,
93                         int width) {
94  __m128i xmm0, xmm1, xmm2, xmm4;
95  const __m128i xmm5 = _mm_set1_epi8(-1);
96  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
97  while (width > 0) {
98    READYUV422
99    YUVTORGB(yuvconstants)
100    STOREARGB
101    width -= 8;
102  }
103}
104#endif
105
106#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
107void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
108                              const uint8* u_buf,
109                              const uint8* v_buf,
110                              const uint8* a_buf,
111                              uint8* dst_argb,
112                              const struct YuvConstants* yuvconstants,
113                              int width) {
114  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
115  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
116  while (width > 0) {
117    READYUVA422
118    YUVTORGB(yuvconstants)
119    STOREARGB
120    width -= 8;
121  }
122}
123#endif
124
125// 32 bit
126#else  // defined(_M_X64)
127#ifdef HAS_ARGBTOYROW_SSSE3
128
129// Constants for ARGB.
130static const vec8 kARGBToY = {
131  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
132};
133
134// JPeg full range.
135static const vec8 kARGBToYJ = {
136  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
137};
138
139static const vec8 kARGBToU = {
140  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
141};
142
143static const vec8 kARGBToUJ = {
144  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
145};
146
147static const vec8 kARGBToV = {
148  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
149};
150
151static const vec8 kARGBToVJ = {
152  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
153};
154
155// vpshufb for vphaddw + vpackuswb packed to shorts.
156static const lvec8 kShufARGBToUV_AVX = {
157  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
158  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
159};
160
161// Constants for BGRA.
162static const vec8 kBGRAToY = {
163  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
164};
165
166static const vec8 kBGRAToU = {
167  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
168};
169
170static const vec8 kBGRAToV = {
171  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
172};
173
174// Constants for ABGR.
175static const vec8 kABGRToY = {
176  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
177};
178
179static const vec8 kABGRToU = {
180  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
181};
182
183static const vec8 kABGRToV = {
184  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
185};
186
187// Constants for RGBA.
188static const vec8 kRGBAToY = {
189  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
190};
191
192static const vec8 kRGBAToU = {
193  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
194};
195
196static const vec8 kRGBAToV = {
197  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
198};
199
200static const uvec8 kAddY16 = {
201  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
202};
203
204// 7 bit fixed point 0.5.
205static const vec16 kAddYJ64 = {
206  64, 64, 64, 64, 64, 64, 64, 64
207};
208
209static const uvec8 kAddUV128 = {
210  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
211  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
212};
213
214static const uvec16 kAddUVJ128 = {
215  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
216};
217
218// Shuffle table for converting RGB24 to ARGB.
219static const uvec8 kShuffleMaskRGB24ToARGB = {
220  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
221};
222
223// Shuffle table for converting RAW to ARGB.
224static const uvec8 kShuffleMaskRAWToARGB = {
225  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
226};
227
228// Shuffle table for converting RAW to RGB24.  First 8.
229static const uvec8 kShuffleMaskRAWToRGB24_0 = {
230  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
231  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
232};
233
234// Shuffle table for converting RAW to RGB24.  Middle 8.
235static const uvec8 kShuffleMaskRAWToRGB24_1 = {
236  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
237  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
238};
239
240// Shuffle table for converting RAW to RGB24.  Last 8.
241static const uvec8 kShuffleMaskRAWToRGB24_2 = {
242  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
243  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
244};
245
246// Shuffle table for converting ARGB to RGB24.
247static const uvec8 kShuffleMaskARGBToRGB24 = {
248  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
249};
250
251// Shuffle table for converting ARGB to RAW.
252static const uvec8 kShuffleMaskARGBToRAW = {
253  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
254};
255
256// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
257static const uvec8 kShuffleMaskARGBToRGB24_0 = {
258  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
259};
260
261// YUY2 shuf 16 Y to 32 Y.
262static const lvec8 kShuffleYUY2Y = {
263  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
264  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
265};
266
267// YUY2 shuf 8 UV to 16 UV.
268static const lvec8 kShuffleYUY2UV = {
269  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
270  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
271};
272
273// UYVY shuf 16 Y to 32 Y.
274static const lvec8 kShuffleUYVYY = {
275  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
276  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
277};
278
279// UYVY shuf 8 UV to 16 UV.
280static const lvec8 kShuffleUYVYUV = {
281  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
282  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
283};
284
285// NV21 shuf 8 VU to 16 UV.
286static const lvec8 kShuffleNV21 = {
287  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
288  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
289};
290
291// Duplicates gray value 3 times and fills in alpha opaque.
292__declspec(naked)
293void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
294  __asm {
295    mov        eax, [esp + 4]        // src_y
296    mov        edx, [esp + 8]        // dst_argb
297    mov        ecx, [esp + 12]       // width
298    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
299    pslld      xmm5, 24
300
301  convertloop:
302    movq       xmm0, qword ptr [eax]
303    lea        eax,  [eax + 8]
304    punpcklbw  xmm0, xmm0
305    movdqa     xmm1, xmm0
306    punpcklwd  xmm0, xmm0
307    punpckhwd  xmm1, xmm1
308    por        xmm0, xmm5
309    por        xmm1, xmm5
310    movdqu     [edx], xmm0
311    movdqu     [edx + 16], xmm1
312    lea        edx, [edx + 32]
313    sub        ecx, 8
314    jg         convertloop
315    ret
316  }
317}
318
319#ifdef HAS_J400TOARGBROW_AVX2
320// Duplicates gray value 3 times and fills in alpha opaque.
321__declspec(naked)
322void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
323  __asm {
324    mov         eax, [esp + 4]        // src_y
325    mov         edx, [esp + 8]        // dst_argb
326    mov         ecx, [esp + 12]       // width
327    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
328    vpslld      ymm5, ymm5, 24
329
330  convertloop:
331    vmovdqu     xmm0, [eax]
332    lea         eax,  [eax + 16]
333    vpermq      ymm0, ymm0, 0xd8
334    vpunpcklbw  ymm0, ymm0, ymm0
335    vpermq      ymm0, ymm0, 0xd8
336    vpunpckhwd  ymm1, ymm0, ymm0
337    vpunpcklwd  ymm0, ymm0, ymm0
338    vpor        ymm0, ymm0, ymm5
339    vpor        ymm1, ymm1, ymm5
340    vmovdqu     [edx], ymm0
341    vmovdqu     [edx + 32], ymm1
342    lea         edx, [edx + 64]
343    sub         ecx, 16
344    jg          convertloop
345    vzeroupper
346    ret
347  }
348}
349#endif  // HAS_J400TOARGBROW_AVX2
350
351__declspec(naked)
352void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
353  __asm {
354    mov       eax, [esp + 4]   // src_rgb24
355    mov       edx, [esp + 8]   // dst_argb
356    mov       ecx, [esp + 12]  // width
357    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
358    pslld     xmm5, 24
359    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
360
361 convertloop:
362    movdqu    xmm0, [eax]
363    movdqu    xmm1, [eax + 16]
364    movdqu    xmm3, [eax + 32]
365    lea       eax, [eax + 48]
366    movdqa    xmm2, xmm3
367    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
368    pshufb    xmm2, xmm4
369    por       xmm2, xmm5
370    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
371    pshufb    xmm0, xmm4
372    movdqu    [edx + 32], xmm2
373    por       xmm0, xmm5
374    pshufb    xmm1, xmm4
375    movdqu    [edx], xmm0
376    por       xmm1, xmm5
377    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
378    pshufb    xmm3, xmm4
379    movdqu    [edx + 16], xmm1
380    por       xmm3, xmm5
381    movdqu    [edx + 48], xmm3
382    lea       edx, [edx + 64]
383    sub       ecx, 16
384    jg        convertloop
385    ret
386  }
387}
388
389__declspec(naked)
390void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
391                        int width) {
392  __asm {
393    mov       eax, [esp + 4]   // src_raw
394    mov       edx, [esp + 8]   // dst_argb
395    mov       ecx, [esp + 12]  // width
396    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
397    pslld     xmm5, 24
398    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
399
400 convertloop:
401    movdqu    xmm0, [eax]
402    movdqu    xmm1, [eax + 16]
403    movdqu    xmm3, [eax + 32]
404    lea       eax, [eax + 48]
405    movdqa    xmm2, xmm3
406    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
407    pshufb    xmm2, xmm4
408    por       xmm2, xmm5
409    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
410    pshufb    xmm0, xmm4
411    movdqu    [edx + 32], xmm2
412    por       xmm0, xmm5
413    pshufb    xmm1, xmm4
414    movdqu    [edx], xmm0
415    por       xmm1, xmm5
416    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
417    pshufb    xmm3, xmm4
418    movdqu    [edx + 16], xmm1
419    por       xmm3, xmm5
420    movdqu    [edx + 48], xmm3
421    lea       edx, [edx + 64]
422    sub       ecx, 16
423    jg        convertloop
424    ret
425  }
426}
427
428__declspec(naked)
429void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
430  __asm {
431    mov       eax, [esp + 4]   // src_raw
432    mov       edx, [esp + 8]   // dst_rgb24
433    mov       ecx, [esp + 12]  // width
434    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
435    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
436    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
437
438 convertloop:
439    movdqu    xmm0, [eax]
440    movdqu    xmm1, [eax + 4]
441    movdqu    xmm2, [eax + 8]
442    lea       eax, [eax + 24]
443    pshufb    xmm0, xmm3
444    pshufb    xmm1, xmm4
445    pshufb    xmm2, xmm5
446    movq      qword ptr [edx], xmm0
447    movq      qword ptr [edx + 8], xmm1
448    movq      qword ptr [edx + 16], xmm2
449    lea       edx, [edx + 24]
450    sub       ecx, 8
451    jg        convertloop
452    ret
453  }
454}
455
456// pmul method to replicate bits.
457// Math to replicate bits:
458// (v << 8) | (v << 3)
459// v * 256 + v * 8
460// v * (256 + 8)
461// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
462// 20 instructions.
463__declspec(naked)
464void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
465                          int width) {
466  __asm {
467    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
468    movd      xmm5, eax
469    pshufd    xmm5, xmm5, 0
470    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
471    movd      xmm6, eax
472    pshufd    xmm6, xmm6, 0
473    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
474    psllw     xmm3, 11
475    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
476    psllw     xmm4, 10
477    psrlw     xmm4, 5
478    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
479    psllw     xmm7, 8
480
481    mov       eax, [esp + 4]   // src_rgb565
482    mov       edx, [esp + 8]   // dst_argb
483    mov       ecx, [esp + 12]  // width
484    sub       edx, eax
485    sub       edx, eax
486
487 convertloop:
488    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
489    movdqa    xmm1, xmm0
490    movdqa    xmm2, xmm0
491    pand      xmm1, xmm3    // R in upper 5 bits
492    psllw     xmm2, 11      // B in upper 5 bits
493    pmulhuw   xmm1, xmm5    // * (256 + 8)
494    pmulhuw   xmm2, xmm5    // * (256 + 8)
495    psllw     xmm1, 8
496    por       xmm1, xmm2    // RB
497    pand      xmm0, xmm4    // G in middle 6 bits
498    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
499    por       xmm0, xmm7    // AG
500    movdqa    xmm2, xmm1
501    punpcklbw xmm1, xmm0
502    punpckhbw xmm2, xmm0
503    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
504    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
505    lea       eax, [eax + 16]
506    sub       ecx, 8
507    jg        convertloop
508    ret
509  }
510}
511
512#ifdef HAS_RGB565TOARGBROW_AVX2
513// pmul method to replicate bits.
514// Math to replicate bits:
515// (v << 8) | (v << 3)
516// v * 256 + v * 8
517// v * (256 + 8)
518// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
519__declspec(naked)
520void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
521                          int width) {
522  __asm {
523    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
524    vmovd      xmm5, eax
525    vbroadcastss ymm5, xmm5
526    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
527    vmovd      xmm6, eax
528    vbroadcastss ymm6, xmm6
529    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
530    vpsllw     ymm3, ymm3, 11
531    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
532    vpsllw     ymm4, ymm4, 10
533    vpsrlw     ymm4, ymm4, 5
534    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
535    vpsllw     ymm7, ymm7, 8
536
537    mov        eax, [esp + 4]   // src_rgb565
538    mov        edx, [esp + 8]   // dst_argb
539    mov        ecx, [esp + 12]  // width
540    sub        edx, eax
541    sub        edx, eax
542
543 convertloop:
544    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
545    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
546    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
547    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
548    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
549    vpsllw     ymm1, ymm1, 8
550    vpor       ymm1, ymm1, ymm2    // RB
551    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
552    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
553    vpor       ymm0, ymm0, ymm7    // AG
554    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
555    vpermq     ymm1, ymm1, 0xd8
556    vpunpckhbw ymm2, ymm1, ymm0
557    vpunpcklbw ymm1, ymm1, ymm0
558    vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB
559    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB
560    lea       eax, [eax + 32]
561    sub       ecx, 16
562    jg        convertloop
563    vzeroupper
564    ret
565  }
566}
567#endif  // HAS_RGB565TOARGBROW_AVX2
568
569#ifdef HAS_ARGB1555TOARGBROW_AVX2
570__declspec(naked)
571void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
572                            int width) {
573  __asm {
574    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
575    vmovd      xmm5, eax
576    vbroadcastss ymm5, xmm5
577    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
578    vmovd      xmm6, eax
579    vbroadcastss ymm6, xmm6
580    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
581    vpsllw     ymm3, ymm3, 11
582    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
583    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
584    vpsllw     ymm7, ymm7, 8
585
586    mov        eax,  [esp + 4]   // src_argb1555
587    mov        edx,  [esp + 8]   // dst_argb
588    mov        ecx,  [esp + 12]  // width
589    sub        edx,  eax
590    sub        edx,  eax
591
592 convertloop:
593    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
594    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
595    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
596    vpand      ymm1, ymm1, ymm3
597    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
598    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
599    vpsllw     ymm1, ymm1, 8
600    vpor       ymm1, ymm1, ymm2    // RB
601    vpsraw     ymm2, ymm0, 8       // A
602    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
603    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
604    vpand      ymm2, ymm2, ymm7
605    vpor       ymm0, ymm0, ymm2    // AG
606    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
607    vpermq     ymm1, ymm1, 0xd8
608    vpunpckhbw ymm2, ymm1, ymm0
609    vpunpcklbw ymm1, ymm1, ymm0
610    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
611    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
612    lea       eax, [eax + 32]
613    sub       ecx, 16
614    jg        convertloop
615    vzeroupper
616    ret
617  }
618}
619#endif  // HAS_ARGB1555TOARGBROW_AVX2
620
621#ifdef HAS_ARGB4444TOARGBROW_AVX2
622__declspec(naked)
623void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
624                            int width) {
625  __asm {
626    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
627    vmovd     xmm4, eax
628    vbroadcastss ymm4, xmm4
629    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
630    mov       eax,  [esp + 4]   // src_argb4444
631    mov       edx,  [esp + 8]   // dst_argb
632    mov       ecx,  [esp + 12]  // width
633    sub       edx,  eax
634    sub       edx,  eax
635
636 convertloop:
637    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
638    vpand      ymm2, ymm0, ymm5    // mask high nibbles
639    vpand      ymm0, ymm0, ymm4    // mask low nibbles
640    vpsrlw     ymm3, ymm2, 4
641    vpsllw     ymm1, ymm0, 4
642    vpor       ymm2, ymm2, ymm3
643    vpor       ymm0, ymm0, ymm1
644    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
645    vpermq     ymm2, ymm2, 0xd8
646    vpunpckhbw ymm1, ymm0, ymm2
647    vpunpcklbw ymm0, ymm0, ymm2
648    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
649    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
650    lea       eax, [eax + 32]
651    sub       ecx, 16
652    jg        convertloop
653    vzeroupper
654    ret
655  }
656}
657#endif  // HAS_ARGB4444TOARGBROW_AVX2
658
659// 24 instructions
660__declspec(naked)
661void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
662                            int width) {
663  __asm {
664    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
665    movd      xmm5, eax
666    pshufd    xmm5, xmm5, 0
667    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
668    movd      xmm6, eax
669    pshufd    xmm6, xmm6, 0
670    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
671    psllw     xmm3, 11
672    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
673    psrlw     xmm4, 6
674    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
675    psllw     xmm7, 8
676
677    mov       eax, [esp + 4]   // src_argb1555
678    mov       edx, [esp + 8]   // dst_argb
679    mov       ecx, [esp + 12]  // width
680    sub       edx, eax
681    sub       edx, eax
682
683 convertloop:
684    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
685    movdqa    xmm1, xmm0
686    movdqa    xmm2, xmm0
687    psllw     xmm1, 1       // R in upper 5 bits
688    psllw     xmm2, 11      // B in upper 5 bits
689    pand      xmm1, xmm3
690    pmulhuw   xmm2, xmm5    // * (256 + 8)
691    pmulhuw   xmm1, xmm5    // * (256 + 8)
692    psllw     xmm1, 8
693    por       xmm1, xmm2    // RB
694    movdqa    xmm2, xmm0
695    pand      xmm0, xmm4    // G in middle 5 bits
696    psraw     xmm2, 8       // A
697    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
698    pand      xmm2, xmm7
699    por       xmm0, xmm2    // AG
700    movdqa    xmm2, xmm1
701    punpcklbw xmm1, xmm0
702    punpckhbw xmm2, xmm0
703    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
704    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
705    lea       eax, [eax + 16]
706    sub       ecx, 8
707    jg        convertloop
708    ret
709  }
710}
711
712// 18 instructions.
713__declspec(naked)
714void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
715                            int width) {
716  __asm {
717    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
718    movd      xmm4, eax
719    pshufd    xmm4, xmm4, 0
720    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
721    pslld     xmm5, 4
722    mov       eax, [esp + 4]   // src_argb4444
723    mov       edx, [esp + 8]   // dst_argb
724    mov       ecx, [esp + 12]  // width
725    sub       edx, eax
726    sub       edx, eax
727
728 convertloop:
729    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
730    movdqa    xmm2, xmm0
731    pand      xmm0, xmm4    // mask low nibbles
732    pand      xmm2, xmm5    // mask high nibbles
733    movdqa    xmm1, xmm0
734    movdqa    xmm3, xmm2
735    psllw     xmm1, 4
736    psrlw     xmm3, 4
737    por       xmm0, xmm1
738    por       xmm2, xmm3
739    movdqa    xmm1, xmm0
740    punpcklbw xmm0, xmm2
741    punpckhbw xmm1, xmm2
742    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
743    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
744    lea       eax, [eax + 16]
745    sub       ecx, 8
746    jg        convertloop
747    ret
748  }
749}
750
751__declspec(naked)
752void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
753  __asm {
754    mov       eax, [esp + 4]   // src_argb
755    mov       edx, [esp + 8]   // dst_rgb
756    mov       ecx, [esp + 12]  // width
757    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
758
759 convertloop:
760    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
761    movdqu    xmm1, [eax + 16]
762    movdqu    xmm2, [eax + 32]
763    movdqu    xmm3, [eax + 48]
764    lea       eax, [eax + 64]
765    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
766    pshufb    xmm1, xmm6
767    pshufb    xmm2, xmm6
768    pshufb    xmm3, xmm6
769    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
770    psrldq    xmm1, 4      // 8 bytes from 1
771    pslldq    xmm4, 12     // 4 bytes from 1 for 0
772    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
773    por       xmm0, xmm4   // 4 bytes from 1 for 0
774    pslldq    xmm5, 8      // 8 bytes from 2 for 1
775    movdqu    [edx], xmm0  // store 0
776    por       xmm1, xmm5   // 8 bytes from 2 for 1
777    psrldq    xmm2, 8      // 4 bytes from 2
778    pslldq    xmm3, 4      // 12 bytes from 3 for 2
779    por       xmm2, xmm3   // 12 bytes from 3 for 2
780    movdqu    [edx + 16], xmm1   // store 1
781    movdqu    [edx + 32], xmm2   // store 2
782    lea       edx, [edx + 48]
783    sub       ecx, 16
784    jg        convertloop
785    ret
786  }
787}
788
789__declspec(naked)
790void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
791  __asm {
792    mov       eax, [esp + 4]   // src_argb
793    mov       edx, [esp + 8]   // dst_rgb
794    mov       ecx, [esp + 12]  // width
795    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
796
797 convertloop:
798    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
799    movdqu    xmm1, [eax + 16]
800    movdqu    xmm2, [eax + 32]
801    movdqu    xmm3, [eax + 48]
802    lea       eax, [eax + 64]
803    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
804    pshufb    xmm1, xmm6
805    pshufb    xmm2, xmm6
806    pshufb    xmm3, xmm6
807    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
808    psrldq    xmm1, 4      // 8 bytes from 1
809    pslldq    xmm4, 12     // 4 bytes from 1 for 0
810    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
811    por       xmm0, xmm4   // 4 bytes from 1 for 0
812    pslldq    xmm5, 8      // 8 bytes from 2 for 1
813    movdqu    [edx], xmm0  // store 0
814    por       xmm1, xmm5   // 8 bytes from 2 for 1
815    psrldq    xmm2, 8      // 4 bytes from 2
816    pslldq    xmm3, 4      // 12 bytes from 3 for 2
817    por       xmm2, xmm3   // 12 bytes from 3 for 2
818    movdqu    [edx + 16], xmm1   // store 1
819    movdqu    [edx + 32], xmm2   // store 2
820    lea       edx, [edx + 48]
821    sub       ecx, 16
822    jg        convertloop
823    ret
824  }
825}
826
827__declspec(naked)
828void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
829  __asm {
830    mov       eax, [esp + 4]   // src_argb
831    mov       edx, [esp + 8]   // dst_rgb
832    mov       ecx, [esp + 12]  // width
833    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
834    psrld     xmm3, 27
835    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
836    psrld     xmm4, 26
837    pslld     xmm4, 5
838    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
839    pslld     xmm5, 11
840
841 convertloop:
842    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
843    movdqa    xmm1, xmm0    // B
844    movdqa    xmm2, xmm0    // G
845    pslld     xmm0, 8       // R
846    psrld     xmm1, 3       // B
847    psrld     xmm2, 5       // G
848    psrad     xmm0, 16      // R
849    pand      xmm1, xmm3    // B
850    pand      xmm2, xmm4    // G
851    pand      xmm0, xmm5    // R
852    por       xmm1, xmm2    // BG
853    por       xmm0, xmm1    // BGR
854    packssdw  xmm0, xmm0
855    lea       eax, [eax + 16]
856    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
857    lea       edx, [edx + 8]
858    sub       ecx, 4
859    jg        convertloop
860    ret
861  }
862}
863
864__declspec(naked)
865void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
866                                const uint32 dither4, int width) {
867  __asm {
868
869    mov       eax, [esp + 4]   // src_argb
870    mov       edx, [esp + 8]   // dst_rgb
871    movd      xmm6, [esp + 12] // dither4
872    mov       ecx, [esp + 16]  // width
873    punpcklbw xmm6, xmm6       // make dither 16 bytes
874    movdqa    xmm7, xmm6
875    punpcklwd xmm6, xmm6
876    punpckhwd xmm7, xmm7
877    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
878    psrld     xmm3, 27
879    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
880    psrld     xmm4, 26
881    pslld     xmm4, 5
882    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
883    pslld     xmm5, 11
884
885 convertloop:
886    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
887    paddusb   xmm0, xmm6    // add dither
888    movdqa    xmm1, xmm0    // B
889    movdqa    xmm2, xmm0    // G
890    pslld     xmm0, 8       // R
891    psrld     xmm1, 3       // B
892    psrld     xmm2, 5       // G
893    psrad     xmm0, 16      // R
894    pand      xmm1, xmm3    // B
895    pand      xmm2, xmm4    // G
896    pand      xmm0, xmm5    // R
897    por       xmm1, xmm2    // BG
898    por       xmm0, xmm1    // BGR
899    packssdw  xmm0, xmm0
900    lea       eax, [eax + 16]
901    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
902    lea       edx, [edx + 8]
903    sub       ecx, 4
904    jg        convertloop
905    ret
906  }
907}
908
909#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
910__declspec(naked)
911void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
912                                const uint32 dither4, int width) {
913  __asm {
914    mov        eax, [esp + 4]      // src_argb
915    mov        edx, [esp + 8]      // dst_rgb
916    vbroadcastss xmm6, [esp + 12]  // dither4
917    mov        ecx, [esp + 16]     // width
918    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
919    vpermq     ymm6, ymm6, 0xd8
920    vpunpcklwd ymm6, ymm6, ymm6
921    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
922    vpsrld     ymm3, ymm3, 27
923    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
924    vpsrld     ymm4, ymm4, 26
925    vpslld     ymm4, ymm4, 5
926    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
927
928 convertloop:
929    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
930    vpaddusb   ymm0, ymm0, ymm6    // add dither
931    vpsrld     ymm2, ymm0, 5       // G
932    vpsrld     ymm1, ymm0, 3       // B
933    vpsrld     ymm0, ymm0, 8       // R
934    vpand      ymm2, ymm2, ymm4    // G
935    vpand      ymm1, ymm1, ymm3    // B
936    vpand      ymm0, ymm0, ymm5    // R
937    vpor       ymm1, ymm1, ymm2    // BG
938    vpor       ymm0, ymm0, ymm1    // BGR
939    vpackusdw  ymm0, ymm0, ymm0
940    vpermq     ymm0, ymm0, 0xd8
941    lea        eax, [eax + 32]
942    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
943    lea        edx, [edx + 16]
944    sub        ecx, 8
945    jg         convertloop
946    vzeroupper
947    ret
948  }
949}
950#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
951
952// TODO(fbarchard): Improve sign extension/packing.
953__declspec(naked)
954void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
955  __asm {
956    mov       eax, [esp + 4]   // src_argb
957    mov       edx, [esp + 8]   // dst_rgb
958    mov       ecx, [esp + 12]  // width
959    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
960    psrld     xmm4, 27
961    movdqa    xmm5, xmm4       // generate mask 0x000003e0
962    pslld     xmm5, 5
963    movdqa    xmm6, xmm4       // generate mask 0x00007c00
964    pslld     xmm6, 10
965    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
966    pslld     xmm7, 15
967
968 convertloop:
969    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
970    movdqa    xmm1, xmm0    // B
971    movdqa    xmm2, xmm0    // G
972    movdqa    xmm3, xmm0    // R
973    psrad     xmm0, 16      // A
974    psrld     xmm1, 3       // B
975    psrld     xmm2, 6       // G
976    psrld     xmm3, 9       // R
977    pand      xmm0, xmm7    // A
978    pand      xmm1, xmm4    // B
979    pand      xmm2, xmm5    // G
980    pand      xmm3, xmm6    // R
981    por       xmm0, xmm1    // BA
982    por       xmm2, xmm3    // GR
983    por       xmm0, xmm2    // BGRA
984    packssdw  xmm0, xmm0
985    lea       eax, [eax + 16]
986    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
987    lea       edx, [edx + 8]
988    sub       ecx, 4
989    jg        convertloop
990    ret
991  }
992}
993
994__declspec(naked)
995void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
996  __asm {
997    mov       eax, [esp + 4]   // src_argb
998    mov       edx, [esp + 8]   // dst_rgb
999    mov       ecx, [esp + 12]  // width
1000    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
1001    psllw     xmm4, 12
1002    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
1003    psrlw     xmm3, 8
1004
1005 convertloop:
1006    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
1007    movdqa    xmm1, xmm0
1008    pand      xmm0, xmm3    // low nibble
1009    pand      xmm1, xmm4    // high nibble
1010    psrld     xmm0, 4
1011    psrld     xmm1, 8
1012    por       xmm0, xmm1
1013    packuswb  xmm0, xmm0
1014    lea       eax, [eax + 16]
1015    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
1016    lea       edx, [edx + 8]
1017    sub       ecx, 4
1018    jg        convertloop
1019    ret
1020  }
1021}
1022
1023#ifdef HAS_ARGBTORGB565ROW_AVX2
1024__declspec(naked)
1025void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1026  __asm {
1027    mov        eax, [esp + 4]      // src_argb
1028    mov        edx, [esp + 8]      // dst_rgb
1029    mov        ecx, [esp + 12]     // width
1030    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
1031    vpsrld     ymm3, ymm3, 27
1032    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
1033    vpsrld     ymm4, ymm4, 26
1034    vpslld     ymm4, ymm4, 5
1035    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
1036
1037 convertloop:
1038    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1039    vpsrld     ymm2, ymm0, 5       // G
1040    vpsrld     ymm1, ymm0, 3       // B
1041    vpsrld     ymm0, ymm0, 8       // R
1042    vpand      ymm2, ymm2, ymm4    // G
1043    vpand      ymm1, ymm1, ymm3    // B
1044    vpand      ymm0, ymm0, ymm5    // R
1045    vpor       ymm1, ymm1, ymm2    // BG
1046    vpor       ymm0, ymm0, ymm1    // BGR
1047    vpackusdw  ymm0, ymm0, ymm0
1048    vpermq     ymm0, ymm0, 0xd8
1049    lea        eax, [eax + 32]
1050    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
1051    lea        edx, [edx + 16]
1052    sub        ecx, 8
1053    jg         convertloop
1054    vzeroupper
1055    ret
1056  }
1057}
1058#endif  // HAS_ARGBTORGB565ROW_AVX2
1059
1060#ifdef HAS_ARGBTOARGB1555ROW_AVX2
1061__declspec(naked)
1062void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1063  __asm {
1064    mov        eax, [esp + 4]      // src_argb
1065    mov        edx, [esp + 8]      // dst_rgb
1066    mov        ecx, [esp + 12]     // width
1067    vpcmpeqb   ymm4, ymm4, ymm4
1068    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
1069    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
1070    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
1071    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
1072    vpslld     ymm7, ymm7, 15
1073
1074 convertloop:
1075    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1076    vpsrld     ymm3, ymm0, 9       // R
1077    vpsrld     ymm2, ymm0, 6       // G
1078    vpsrld     ymm1, ymm0, 3       // B
1079    vpsrad     ymm0, ymm0, 16      // A
1080    vpand      ymm3, ymm3, ymm6    // R
1081    vpand      ymm2, ymm2, ymm5    // G
1082    vpand      ymm1, ymm1, ymm4    // B
1083    vpand      ymm0, ymm0, ymm7    // A
1084    vpor       ymm0, ymm0, ymm1    // BA
1085    vpor       ymm2, ymm2, ymm3    // GR
1086    vpor       ymm0, ymm0, ymm2    // BGRA
1087    vpackssdw  ymm0, ymm0, ymm0
1088    vpermq     ymm0, ymm0, 0xd8
1089    lea        eax, [eax + 32]
1090    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
1091    lea        edx, [edx + 16]
1092    sub        ecx, 8
1093    jg         convertloop
1094    vzeroupper
1095    ret
1096  }
1097}
1098#endif  // HAS_ARGBTOARGB1555ROW_AVX2
1099
1100#ifdef HAS_ARGBTOARGB4444ROW_AVX2
1101__declspec(naked)
1102void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1103  __asm {
1104    mov        eax, [esp + 4]   // src_argb
1105    mov        edx, [esp + 8]   // dst_rgb
1106    mov        ecx, [esp + 12]  // width
1107    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
1108    vpsllw     ymm4, ymm4, 12
1109    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
1110
1111 convertloop:
1112    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1113    vpand      ymm1, ymm0, ymm4    // high nibble
1114    vpand      ymm0, ymm0, ymm3    // low nibble
1115    vpsrld     ymm1, ymm1, 8
1116    vpsrld     ymm0, ymm0, 4
1117    vpor       ymm0, ymm0, ymm1
1118    vpackuswb  ymm0, ymm0, ymm0
1119    vpermq     ymm0, ymm0, 0xd8
1120    lea        eax, [eax + 32]
1121    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
1122    lea        edx, [edx + 16]
1123    sub        ecx, 8
1124    jg         convertloop
1125    vzeroupper
1126    ret
1127  }
1128}
1129#endif  // HAS_ARGBTOARGB4444ROW_AVX2
1130
1131// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1132__declspec(naked)
1133void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1134  __asm {
1135    mov        eax, [esp + 4]   /* src_argb */
1136    mov        edx, [esp + 8]   /* dst_y */
1137    mov        ecx, [esp + 12]  /* width */
1138    movdqa     xmm4, xmmword ptr kARGBToY
1139    movdqa     xmm5, xmmword ptr kAddY16
1140
1141 convertloop:
1142    movdqu     xmm0, [eax]
1143    movdqu     xmm1, [eax + 16]
1144    movdqu     xmm2, [eax + 32]
1145    movdqu     xmm3, [eax + 48]
1146    pmaddubsw  xmm0, xmm4
1147    pmaddubsw  xmm1, xmm4
1148    pmaddubsw  xmm2, xmm4
1149    pmaddubsw  xmm3, xmm4
1150    lea        eax, [eax + 64]
1151    phaddw     xmm0, xmm1
1152    phaddw     xmm2, xmm3
1153    psrlw      xmm0, 7
1154    psrlw      xmm2, 7
1155    packuswb   xmm0, xmm2
1156    paddb      xmm0, xmm5
1157    movdqu     [edx], xmm0
1158    lea        edx, [edx + 16]
1159    sub        ecx, 16
1160    jg         convertloop
1161    ret
1162  }
1163}
1164
1165// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1166// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1167__declspec(naked)
1168void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1169  __asm {
1170    mov        eax, [esp + 4]   /* src_argb */
1171    mov        edx, [esp + 8]   /* dst_y */
1172    mov        ecx, [esp + 12]  /* width */
1173    movdqa     xmm4, xmmword ptr kARGBToYJ
1174    movdqa     xmm5, xmmword ptr kAddYJ64
1175
1176 convertloop:
1177    movdqu     xmm0, [eax]
1178    movdqu     xmm1, [eax + 16]
1179    movdqu     xmm2, [eax + 32]
1180    movdqu     xmm3, [eax + 48]
1181    pmaddubsw  xmm0, xmm4
1182    pmaddubsw  xmm1, xmm4
1183    pmaddubsw  xmm2, xmm4
1184    pmaddubsw  xmm3, xmm4
1185    lea        eax, [eax + 64]
1186    phaddw     xmm0, xmm1
1187    phaddw     xmm2, xmm3
1188    paddw      xmm0, xmm5  // Add .5 for rounding.
1189    paddw      xmm2, xmm5
1190    psrlw      xmm0, 7
1191    psrlw      xmm2, 7
1192    packuswb   xmm0, xmm2
1193    movdqu     [edx], xmm0
1194    lea        edx, [edx + 16]
1195    sub        ecx, 16
1196    jg         convertloop
1197    ret
1198  }
1199}
1200
1201#ifdef HAS_ARGBTOYROW_AVX2
1202// vpermd for vphaddw + vpackuswb vpermd.
1203static const lvec32 kPermdARGBToY_AVX = {
1204  0, 4, 1, 5, 2, 6, 3, 7
1205};
1206
1207// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1208__declspec(naked)
1209void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
1210  __asm {
1211    mov        eax, [esp + 4]   /* src_argb */
1212    mov        edx, [esp + 8]   /* dst_y */
1213    mov        ecx, [esp + 12]  /* width */
1214    vbroadcastf128 ymm4, xmmword ptr kARGBToY
1215    vbroadcastf128 ymm5, xmmword ptr kAddY16
1216    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1217
1218 convertloop:
1219    vmovdqu    ymm0, [eax]
1220    vmovdqu    ymm1, [eax + 32]
1221    vmovdqu    ymm2, [eax + 64]
1222    vmovdqu    ymm3, [eax + 96]
1223    vpmaddubsw ymm0, ymm0, ymm4
1224    vpmaddubsw ymm1, ymm1, ymm4
1225    vpmaddubsw ymm2, ymm2, ymm4
1226    vpmaddubsw ymm3, ymm3, ymm4
1227    lea        eax, [eax + 128]
1228    vphaddw    ymm0, ymm0, ymm1  // mutates.
1229    vphaddw    ymm2, ymm2, ymm3
1230    vpsrlw     ymm0, ymm0, 7
1231    vpsrlw     ymm2, ymm2, 7
1232    vpackuswb  ymm0, ymm0, ymm2  // mutates.
1233    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1234    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
1235    vmovdqu    [edx], ymm0
1236    lea        edx, [edx + 32]
1237    sub        ecx, 32
1238    jg         convertloop
1239    vzeroupper
1240    ret
1241  }
1242}
1243#endif  //  HAS_ARGBTOYROW_AVX2
1244
1245#ifdef HAS_ARGBTOYJROW_AVX2
1246// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1247__declspec(naked)
1248void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
1249  __asm {
1250    mov        eax, [esp + 4]   /* src_argb */
1251    mov        edx, [esp + 8]   /* dst_y */
1252    mov        ecx, [esp + 12]  /* width */
1253    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1254    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1255    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1256
1257 convertloop:
1258    vmovdqu    ymm0, [eax]
1259    vmovdqu    ymm1, [eax + 32]
1260    vmovdqu    ymm2, [eax + 64]
1261    vmovdqu    ymm3, [eax + 96]
1262    vpmaddubsw ymm0, ymm0, ymm4
1263    vpmaddubsw ymm1, ymm1, ymm4
1264    vpmaddubsw ymm2, ymm2, ymm4
1265    vpmaddubsw ymm3, ymm3, ymm4
1266    lea        eax, [eax + 128]
1267    vphaddw    ymm0, ymm0, ymm1  // mutates.
1268    vphaddw    ymm2, ymm2, ymm3
1269    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
1270    vpaddw     ymm2, ymm2, ymm5
1271    vpsrlw     ymm0, ymm0, 7
1272    vpsrlw     ymm2, ymm2, 7
1273    vpackuswb  ymm0, ymm0, ymm2  // mutates.
1274    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1275    vmovdqu    [edx], ymm0
1276    lea        edx, [edx + 32]
1277    sub        ecx, 32
1278    jg         convertloop
1279
1280    vzeroupper
1281    ret
1282  }
1283}
1284#endif  //  HAS_ARGBTOYJROW_AVX2
1285
1286__declspec(naked)
1287void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1288  __asm {
1289    mov        eax, [esp + 4]   /* src_argb */
1290    mov        edx, [esp + 8]   /* dst_y */
1291    mov        ecx, [esp + 12]  /* width */
1292    movdqa     xmm4, xmmword ptr kBGRAToY
1293    movdqa     xmm5, xmmword ptr kAddY16
1294
1295 convertloop:
1296    movdqu     xmm0, [eax]
1297    movdqu     xmm1, [eax + 16]
1298    movdqu     xmm2, [eax + 32]
1299    movdqu     xmm3, [eax + 48]
1300    pmaddubsw  xmm0, xmm4
1301    pmaddubsw  xmm1, xmm4
1302    pmaddubsw  xmm2, xmm4
1303    pmaddubsw  xmm3, xmm4
1304    lea        eax, [eax + 64]
1305    phaddw     xmm0, xmm1
1306    phaddw     xmm2, xmm3
1307    psrlw      xmm0, 7
1308    psrlw      xmm2, 7
1309    packuswb   xmm0, xmm2
1310    paddb      xmm0, xmm5
1311    movdqu     [edx], xmm0
1312    lea        edx, [edx + 16]
1313    sub        ecx, 16
1314    jg         convertloop
1315    ret
1316  }
1317}
1318
1319__declspec(naked)
1320void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1321  __asm {
1322    mov        eax, [esp + 4]   /* src_argb */
1323    mov        edx, [esp + 8]   /* dst_y */
1324    mov        ecx, [esp + 12]  /* width */
1325    movdqa     xmm4, xmmword ptr kABGRToY
1326    movdqa     xmm5, xmmword ptr kAddY16
1327
1328 convertloop:
1329    movdqu     xmm0, [eax]
1330    movdqu     xmm1, [eax + 16]
1331    movdqu     xmm2, [eax + 32]
1332    movdqu     xmm3, [eax + 48]
1333    pmaddubsw  xmm0, xmm4
1334    pmaddubsw  xmm1, xmm4
1335    pmaddubsw  xmm2, xmm4
1336    pmaddubsw  xmm3, xmm4
1337    lea        eax, [eax + 64]
1338    phaddw     xmm0, xmm1
1339    phaddw     xmm2, xmm3
1340    psrlw      xmm0, 7
1341    psrlw      xmm2, 7
1342    packuswb   xmm0, xmm2
1343    paddb      xmm0, xmm5
1344    movdqu     [edx], xmm0
1345    lea        edx, [edx + 16]
1346    sub        ecx, 16
1347    jg         convertloop
1348    ret
1349  }
1350}
1351
1352__declspec(naked)
1353void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1354  __asm {
1355    mov        eax, [esp + 4]   /* src_argb */
1356    mov        edx, [esp + 8]   /* dst_y */
1357    mov        ecx, [esp + 12]  /* width */
1358    movdqa     xmm4, xmmword ptr kRGBAToY
1359    movdqa     xmm5, xmmword ptr kAddY16
1360
1361 convertloop:
1362    movdqu     xmm0, [eax]
1363    movdqu     xmm1, [eax + 16]
1364    movdqu     xmm2, [eax + 32]
1365    movdqu     xmm3, [eax + 48]
1366    pmaddubsw  xmm0, xmm4
1367    pmaddubsw  xmm1, xmm4
1368    pmaddubsw  xmm2, xmm4
1369    pmaddubsw  xmm3, xmm4
1370    lea        eax, [eax + 64]
1371    phaddw     xmm0, xmm1
1372    phaddw     xmm2, xmm3
1373    psrlw      xmm0, 7
1374    psrlw      xmm2, 7
1375    packuswb   xmm0, xmm2
1376    paddb      xmm0, xmm5
1377    movdqu     [edx], xmm0
1378    lea        edx, [edx + 16]
1379    sub        ecx, 16
1380    jg         convertloop
1381    ret
1382  }
1383}
1384
1385__declspec(naked)
1386void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1387                       uint8* dst_u, uint8* dst_v, int width) {
1388  __asm {
1389    push       esi
1390    push       edi
1391    mov        eax, [esp + 8 + 4]   // src_argb
1392    mov        esi, [esp + 8 + 8]   // src_stride_argb
1393    mov        edx, [esp + 8 + 12]  // dst_u
1394    mov        edi, [esp + 8 + 16]  // dst_v
1395    mov        ecx, [esp + 8 + 20]  // width
1396    movdqa     xmm5, xmmword ptr kAddUV128
1397    movdqa     xmm6, xmmword ptr kARGBToV
1398    movdqa     xmm7, xmmword ptr kARGBToU
1399    sub        edi, edx             // stride from u to v
1400
1401 convertloop:
1402    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1403    movdqu     xmm0, [eax]
1404    movdqu     xmm4, [eax + esi]
1405    pavgb      xmm0, xmm4
1406    movdqu     xmm1, [eax + 16]
1407    movdqu     xmm4, [eax + esi + 16]
1408    pavgb      xmm1, xmm4
1409    movdqu     xmm2, [eax + 32]
1410    movdqu     xmm4, [eax + esi + 32]
1411    pavgb      xmm2, xmm4
1412    movdqu     xmm3, [eax + 48]
1413    movdqu     xmm4, [eax + esi + 48]
1414    pavgb      xmm3, xmm4
1415
1416    lea        eax,  [eax + 64]
1417    movdqa     xmm4, xmm0
1418    shufps     xmm0, xmm1, 0x88
1419    shufps     xmm4, xmm1, 0xdd
1420    pavgb      xmm0, xmm4
1421    movdqa     xmm4, xmm2
1422    shufps     xmm2, xmm3, 0x88
1423    shufps     xmm4, xmm3, 0xdd
1424    pavgb      xmm2, xmm4
1425
1426    // step 2 - convert to U and V
1427    // from here down is very similar to Y code except
1428    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1429    movdqa     xmm1, xmm0
1430    movdqa     xmm3, xmm2
1431    pmaddubsw  xmm0, xmm7  // U
1432    pmaddubsw  xmm2, xmm7
1433    pmaddubsw  xmm1, xmm6  // V
1434    pmaddubsw  xmm3, xmm6
1435    phaddw     xmm0, xmm2
1436    phaddw     xmm1, xmm3
1437    psraw      xmm0, 8
1438    psraw      xmm1, 8
1439    packsswb   xmm0, xmm1
1440    paddb      xmm0, xmm5            // -> unsigned
1441
1442    // step 3 - store 8 U and 8 V values
1443    movlps     qword ptr [edx], xmm0 // U
1444    movhps     qword ptr [edx + edi], xmm0 // V
1445    lea        edx, [edx + 8]
1446    sub        ecx, 16
1447    jg         convertloop
1448
1449    pop        edi
1450    pop        esi
1451    ret
1452  }
1453}
1454
1455__declspec(naked)
1456void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1457                        uint8* dst_u, uint8* dst_v, int width) {
1458  __asm {
1459    push       esi
1460    push       edi
1461    mov        eax, [esp + 8 + 4]   // src_argb
1462    mov        esi, [esp + 8 + 8]   // src_stride_argb
1463    mov        edx, [esp + 8 + 12]  // dst_u
1464    mov        edi, [esp + 8 + 16]  // dst_v
1465    mov        ecx, [esp + 8 + 20]  // width
1466    movdqa     xmm5, xmmword ptr kAddUVJ128
1467    movdqa     xmm6, xmmword ptr kARGBToVJ
1468    movdqa     xmm7, xmmword ptr kARGBToUJ
1469    sub        edi, edx             // stride from u to v
1470
1471 convertloop:
1472    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1473    movdqu     xmm0, [eax]
1474    movdqu     xmm4, [eax + esi]
1475    pavgb      xmm0, xmm4
1476    movdqu     xmm1, [eax + 16]
1477    movdqu     xmm4, [eax + esi + 16]
1478    pavgb      xmm1, xmm4
1479    movdqu     xmm2, [eax + 32]
1480    movdqu     xmm4, [eax + esi + 32]
1481    pavgb      xmm2, xmm4
1482    movdqu     xmm3, [eax + 48]
1483    movdqu     xmm4, [eax + esi + 48]
1484    pavgb      xmm3, xmm4
1485
1486    lea        eax,  [eax + 64]
1487    movdqa     xmm4, xmm0
1488    shufps     xmm0, xmm1, 0x88
1489    shufps     xmm4, xmm1, 0xdd
1490    pavgb      xmm0, xmm4
1491    movdqa     xmm4, xmm2
1492    shufps     xmm2, xmm3, 0x88
1493    shufps     xmm4, xmm3, 0xdd
1494    pavgb      xmm2, xmm4
1495
1496    // step 2 - convert to U and V
1497    // from here down is very similar to Y code except
1498    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1499    movdqa     xmm1, xmm0
1500    movdqa     xmm3, xmm2
1501    pmaddubsw  xmm0, xmm7  // U
1502    pmaddubsw  xmm2, xmm7
1503    pmaddubsw  xmm1, xmm6  // V
1504    pmaddubsw  xmm3, xmm6
1505    phaddw     xmm0, xmm2
1506    phaddw     xmm1, xmm3
1507    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
1508    paddw      xmm1, xmm5
1509    psraw      xmm0, 8
1510    psraw      xmm1, 8
1511    packsswb   xmm0, xmm1
1512
1513    // step 3 - store 8 U and 8 V values
1514    movlps     qword ptr [edx], xmm0 // U
1515    movhps     qword ptr [edx + edi], xmm0 // V
1516    lea        edx, [edx + 8]
1517    sub        ecx, 16
1518    jg         convertloop
1519
1520    pop        edi
1521    pop        esi
1522    ret
1523  }
1524}
1525
1526#ifdef HAS_ARGBTOUVROW_AVX2
1527__declspec(naked)
1528void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1529                      uint8* dst_u, uint8* dst_v, int width) {
1530  __asm {
1531    push       esi
1532    push       edi
1533    mov        eax, [esp + 8 + 4]   // src_argb
1534    mov        esi, [esp + 8 + 8]   // src_stride_argb
1535    mov        edx, [esp + 8 + 12]  // dst_u
1536    mov        edi, [esp + 8 + 16]  // dst_v
1537    mov        ecx, [esp + 8 + 20]  // width
1538    vbroadcastf128 ymm5, xmmword ptr kAddUV128
1539    vbroadcastf128 ymm6, xmmword ptr kARGBToV
1540    vbroadcastf128 ymm7, xmmword ptr kARGBToU
1541    sub        edi, edx             // stride from u to v
1542
1543 convertloop:
1544    /* step 1 - subsample 32x2 argb pixels to 16x1 */
1545    vmovdqu    ymm0, [eax]
1546    vmovdqu    ymm1, [eax + 32]
1547    vmovdqu    ymm2, [eax + 64]
1548    vmovdqu    ymm3, [eax + 96]
1549    vpavgb     ymm0, ymm0, [eax + esi]
1550    vpavgb     ymm1, ymm1, [eax + esi + 32]
1551    vpavgb     ymm2, ymm2, [eax + esi + 64]
1552    vpavgb     ymm3, ymm3, [eax + esi + 96]
1553    lea        eax,  [eax + 128]
1554    vshufps    ymm4, ymm0, ymm1, 0x88
1555    vshufps    ymm0, ymm0, ymm1, 0xdd
1556    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1557    vshufps    ymm4, ymm2, ymm3, 0x88
1558    vshufps    ymm2, ymm2, ymm3, 0xdd
1559    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1560
1561    // step 2 - convert to U and V
1562    // from here down is very similar to Y code except
1563    // instead of 32 different pixels, its 16 pixels of U and 16 of V
1564    vpmaddubsw ymm1, ymm0, ymm7  // U
1565    vpmaddubsw ymm3, ymm2, ymm7
1566    vpmaddubsw ymm0, ymm0, ymm6  // V
1567    vpmaddubsw ymm2, ymm2, ymm6
1568    vphaddw    ymm1, ymm1, ymm3  // mutates
1569    vphaddw    ymm0, ymm0, ymm2
1570    vpsraw     ymm1, ymm1, 8
1571    vpsraw     ymm0, ymm0, 8
1572    vpacksswb  ymm0, ymm1, ymm0  // mutates
1573    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1574    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1575    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1576
1577    // step 3 - store 16 U and 16 V values
1578    vextractf128 [edx], ymm0, 0 // U
1579    vextractf128 [edx + edi], ymm0, 1 // V
1580    lea        edx, [edx + 16]
1581    sub        ecx, 32
1582    jg         convertloop
1583
1584    pop        edi
1585    pop        esi
1586    vzeroupper
1587    ret
1588  }
1589}
1590#endif  // HAS_ARGBTOUVROW_AVX2
1591
1592#ifdef HAS_ARGBTOUVJROW_AVX2
1593__declspec(naked)
1594void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1595                      uint8* dst_u, uint8* dst_v, int width) {
1596  __asm {
1597    push       esi
1598    push       edi
1599    mov        eax, [esp + 8 + 4]   // src_argb
1600    mov        esi, [esp + 8 + 8]   // src_stride_argb
1601    mov        edx, [esp + 8 + 12]  // dst_u
1602    mov        edi, [esp + 8 + 16]  // dst_v
1603    mov        ecx, [esp + 8 + 20]  // width
1604    vbroadcastf128 ymm5, xmmword ptr kAddUV128
1605    vbroadcastf128 ymm6, xmmword ptr kARGBToV
1606    vbroadcastf128 ymm7, xmmword ptr kARGBToU
1607    sub        edi, edx             // stride from u to v
1608
1609 convertloop:
1610    /* step 1 - subsample 32x2 argb pixels to 16x1 */
1611    vmovdqu    ymm0, [eax]
1612    vmovdqu    ymm1, [eax + 32]
1613    vmovdqu    ymm2, [eax + 64]
1614    vmovdqu    ymm3, [eax + 96]
1615    vpavgb     ymm0, ymm0, [eax + esi]
1616    vpavgb     ymm1, ymm1, [eax + esi + 32]
1617    vpavgb     ymm2, ymm2, [eax + esi + 64]
1618    vpavgb     ymm3, ymm3, [eax + esi + 96]
1619    lea        eax,  [eax + 128]
1620    vshufps    ymm4, ymm0, ymm1, 0x88
1621    vshufps    ymm0, ymm0, ymm1, 0xdd
1622    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1623    vshufps    ymm4, ymm2, ymm3, 0x88
1624    vshufps    ymm2, ymm2, ymm3, 0xdd
1625    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1626
1627    // step 2 - convert to U and V
1628    // from here down is very similar to Y code except
1629    // instead of 32 different pixels, its 16 pixels of U and 16 of V
1630    vpmaddubsw ymm1, ymm0, ymm7  // U
1631    vpmaddubsw ymm3, ymm2, ymm7
1632    vpmaddubsw ymm0, ymm0, ymm6  // V
1633    vpmaddubsw ymm2, ymm2, ymm6
1634    vphaddw    ymm1, ymm1, ymm3  // mutates
1635    vphaddw    ymm0, ymm0, ymm2
1636    vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
1637    vpaddw     ymm0, ymm0, ymm5
1638    vpsraw     ymm1, ymm1, 8
1639    vpsraw     ymm0, ymm0, 8
1640    vpacksswb  ymm0, ymm1, ymm0  // mutates
1641    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1642    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1643
1644    // step 3 - store 16 U and 16 V values
1645    vextractf128 [edx], ymm0, 0 // U
1646    vextractf128 [edx + edi], ymm0, 1 // V
1647    lea        edx, [edx + 16]
1648    sub        ecx, 32
1649    jg         convertloop
1650
1651    pop        edi
1652    pop        esi
1653    vzeroupper
1654    ret
1655  }
1656}
1657#endif  // HAS_ARGBTOUVJROW_AVX2
1658
1659__declspec(naked)
1660void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1661                          uint8* dst_u, uint8* dst_v, int width) {
1662  __asm {
1663    push       edi
1664    mov        eax, [esp + 4 + 4]   // src_argb
1665    mov        edx, [esp + 4 + 8]   // dst_u
1666    mov        edi, [esp + 4 + 12]  // dst_v
1667    mov        ecx, [esp + 4 + 16]  // width
1668    movdqa     xmm5, xmmword ptr kAddUV128
1669    movdqa     xmm6, xmmword ptr kARGBToV
1670    movdqa     xmm7, xmmword ptr kARGBToU
1671    sub        edi, edx             // stride from u to v
1672
1673 convertloop:
1674    /* convert to U and V */
1675    movdqu     xmm0, [eax]          // U
1676    movdqu     xmm1, [eax + 16]
1677    movdqu     xmm2, [eax + 32]
1678    movdqu     xmm3, [eax + 48]
1679    pmaddubsw  xmm0, xmm7
1680    pmaddubsw  xmm1, xmm7
1681    pmaddubsw  xmm2, xmm7
1682    pmaddubsw  xmm3, xmm7
1683    phaddw     xmm0, xmm1
1684    phaddw     xmm2, xmm3
1685    psraw      xmm0, 8
1686    psraw      xmm2, 8
1687    packsswb   xmm0, xmm2
1688    paddb      xmm0, xmm5
1689    movdqu     [edx], xmm0
1690
1691    movdqu     xmm0, [eax]          // V
1692    movdqu     xmm1, [eax + 16]
1693    movdqu     xmm2, [eax + 32]
1694    movdqu     xmm3, [eax + 48]
1695    pmaddubsw  xmm0, xmm6
1696    pmaddubsw  xmm1, xmm6
1697    pmaddubsw  xmm2, xmm6
1698    pmaddubsw  xmm3, xmm6
1699    phaddw     xmm0, xmm1
1700    phaddw     xmm2, xmm3
1701    psraw      xmm0, 8
1702    psraw      xmm2, 8
1703    packsswb   xmm0, xmm2
1704    paddb      xmm0, xmm5
1705    lea        eax,  [eax + 64]
1706    movdqu     [edx + edi], xmm0
1707    lea        edx,  [edx + 16]
1708    sub        ecx,  16
1709    jg         convertloop
1710
1711    pop        edi
1712    ret
1713  }
1714}
1715
1716__declspec(naked)
1717void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1718                       uint8* dst_u, uint8* dst_v, int width) {
1719  __asm {
1720    push       esi
1721    push       edi
1722    mov        eax, [esp + 8 + 4]   // src_argb
1723    mov        esi, [esp + 8 + 8]   // src_stride_argb
1724    mov        edx, [esp + 8 + 12]  // dst_u
1725    mov        edi, [esp + 8 + 16]  // dst_v
1726    mov        ecx, [esp + 8 + 20]  // width
1727    movdqa     xmm5, xmmword ptr kAddUV128
1728    movdqa     xmm6, xmmword ptr kBGRAToV
1729    movdqa     xmm7, xmmword ptr kBGRAToU
1730    sub        edi, edx             // stride from u to v
1731
1732 convertloop:
1733    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1734    movdqu     xmm0, [eax]
1735    movdqu     xmm4, [eax + esi]
1736    pavgb      xmm0, xmm4
1737    movdqu     xmm1, [eax + 16]
1738    movdqu     xmm4, [eax + esi + 16]
1739    pavgb      xmm1, xmm4
1740    movdqu     xmm2, [eax + 32]
1741    movdqu     xmm4, [eax + esi + 32]
1742    pavgb      xmm2, xmm4
1743    movdqu     xmm3, [eax + 48]
1744    movdqu     xmm4, [eax + esi + 48]
1745    pavgb      xmm3, xmm4
1746
1747    lea        eax,  [eax + 64]
1748    movdqa     xmm4, xmm0
1749    shufps     xmm0, xmm1, 0x88
1750    shufps     xmm4, xmm1, 0xdd
1751    pavgb      xmm0, xmm4
1752    movdqa     xmm4, xmm2
1753    shufps     xmm2, xmm3, 0x88
1754    shufps     xmm4, xmm3, 0xdd
1755    pavgb      xmm2, xmm4
1756
1757    // step 2 - convert to U and V
1758    // from here down is very similar to Y code except
1759    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1760    movdqa     xmm1, xmm0
1761    movdqa     xmm3, xmm2
1762    pmaddubsw  xmm0, xmm7  // U
1763    pmaddubsw  xmm2, xmm7
1764    pmaddubsw  xmm1, xmm6  // V
1765    pmaddubsw  xmm3, xmm6
1766    phaddw     xmm0, xmm2
1767    phaddw     xmm1, xmm3
1768    psraw      xmm0, 8
1769    psraw      xmm1, 8
1770    packsswb   xmm0, xmm1
1771    paddb      xmm0, xmm5            // -> unsigned
1772
1773    // step 3 - store 8 U and 8 V values
1774    movlps     qword ptr [edx], xmm0 // U
1775    movhps     qword ptr [edx + edi], xmm0 // V
1776    lea        edx, [edx + 8]
1777    sub        ecx, 16
1778    jg         convertloop
1779
1780    pop        edi
1781    pop        esi
1782    ret
1783  }
1784}
1785
1786__declspec(naked)
1787void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1788                       uint8* dst_u, uint8* dst_v, int width) {
1789  __asm {
1790    push       esi
1791    push       edi
1792    mov        eax, [esp + 8 + 4]   // src_argb
1793    mov        esi, [esp + 8 + 8]   // src_stride_argb
1794    mov        edx, [esp + 8 + 12]  // dst_u
1795    mov        edi, [esp + 8 + 16]  // dst_v
1796    mov        ecx, [esp + 8 + 20]  // width
1797    movdqa     xmm5, xmmword ptr kAddUV128
1798    movdqa     xmm6, xmmword ptr kABGRToV
1799    movdqa     xmm7, xmmword ptr kABGRToU
1800    sub        edi, edx             // stride from u to v
1801
1802 convertloop:
1803    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1804    movdqu     xmm0, [eax]
1805    movdqu     xmm4, [eax + esi]
1806    pavgb      xmm0, xmm4
1807    movdqu     xmm1, [eax + 16]
1808    movdqu     xmm4, [eax + esi + 16]
1809    pavgb      xmm1, xmm4
1810    movdqu     xmm2, [eax + 32]
1811    movdqu     xmm4, [eax + esi + 32]
1812    pavgb      xmm2, xmm4
1813    movdqu     xmm3, [eax + 48]
1814    movdqu     xmm4, [eax + esi + 48]
1815    pavgb      xmm3, xmm4
1816
1817    lea        eax,  [eax + 64]
1818    movdqa     xmm4, xmm0
1819    shufps     xmm0, xmm1, 0x88
1820    shufps     xmm4, xmm1, 0xdd
1821    pavgb      xmm0, xmm4
1822    movdqa     xmm4, xmm2
1823    shufps     xmm2, xmm3, 0x88
1824    shufps     xmm4, xmm3, 0xdd
1825    pavgb      xmm2, xmm4
1826
1827    // step 2 - convert to U and V
1828    // from here down is very similar to Y code except
1829    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1830    movdqa     xmm1, xmm0
1831    movdqa     xmm3, xmm2
1832    pmaddubsw  xmm0, xmm7  // U
1833    pmaddubsw  xmm2, xmm7
1834    pmaddubsw  xmm1, xmm6  // V
1835    pmaddubsw  xmm3, xmm6
1836    phaddw     xmm0, xmm2
1837    phaddw     xmm1, xmm3
1838    psraw      xmm0, 8
1839    psraw      xmm1, 8
1840    packsswb   xmm0, xmm1
1841    paddb      xmm0, xmm5            // -> unsigned
1842
1843    // step 3 - store 8 U and 8 V values
1844    movlps     qword ptr [edx], xmm0 // U
1845    movhps     qword ptr [edx + edi], xmm0 // V
1846    lea        edx, [edx + 8]
1847    sub        ecx, 16
1848    jg         convertloop
1849
1850    pop        edi
1851    pop        esi
1852    ret
1853  }
1854}
1855
1856__declspec(naked)
1857void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1858                       uint8* dst_u, uint8* dst_v, int width) {
1859  __asm {
1860    push       esi
1861    push       edi
1862    mov        eax, [esp + 8 + 4]   // src_argb
1863    mov        esi, [esp + 8 + 8]   // src_stride_argb
1864    mov        edx, [esp + 8 + 12]  // dst_u
1865    mov        edi, [esp + 8 + 16]  // dst_v
1866    mov        ecx, [esp + 8 + 20]  // width
1867    movdqa     xmm5, xmmword ptr kAddUV128
1868    movdqa     xmm6, xmmword ptr kRGBAToV
1869    movdqa     xmm7, xmmword ptr kRGBAToU
1870    sub        edi, edx             // stride from u to v
1871
1872 convertloop:
1873    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874    movdqu     xmm0, [eax]
1875    movdqu     xmm4, [eax + esi]
1876    pavgb      xmm0, xmm4
1877    movdqu     xmm1, [eax + 16]
1878    movdqu     xmm4, [eax + esi + 16]
1879    pavgb      xmm1, xmm4
1880    movdqu     xmm2, [eax + 32]
1881    movdqu     xmm4, [eax + esi + 32]
1882    pavgb      xmm2, xmm4
1883    movdqu     xmm3, [eax + 48]
1884    movdqu     xmm4, [eax + esi + 48]
1885    pavgb      xmm3, xmm4
1886
1887    lea        eax,  [eax + 64]
1888    movdqa     xmm4, xmm0
1889    shufps     xmm0, xmm1, 0x88
1890    shufps     xmm4, xmm1, 0xdd
1891    pavgb      xmm0, xmm4
1892    movdqa     xmm4, xmm2
1893    shufps     xmm2, xmm3, 0x88
1894    shufps     xmm4, xmm3, 0xdd
1895    pavgb      xmm2, xmm4
1896
1897    // step 2 - convert to U and V
1898    // from here down is very similar to Y code except
1899    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1900    movdqa     xmm1, xmm0
1901    movdqa     xmm3, xmm2
1902    pmaddubsw  xmm0, xmm7  // U
1903    pmaddubsw  xmm2, xmm7
1904    pmaddubsw  xmm1, xmm6  // V
1905    pmaddubsw  xmm3, xmm6
1906    phaddw     xmm0, xmm2
1907    phaddw     xmm1, xmm3
1908    psraw      xmm0, 8
1909    psraw      xmm1, 8
1910    packsswb   xmm0, xmm1
1911    paddb      xmm0, xmm5            // -> unsigned
1912
1913    // step 3 - store 8 U and 8 V values
1914    movlps     qword ptr [edx], xmm0 // U
1915    movhps     qword ptr [edx + edi], xmm0 // V
1916    lea        edx, [edx + 8]
1917    sub        ecx, 16
1918    jg         convertloop
1919
1920    pop        edi
1921    pop        esi
1922    ret
1923  }
1924}
1925#endif  // HAS_ARGBTOYROW_SSSE3
1926
1927// Read 16 UV from 444
1928#define READYUV444_AVX2 __asm {                                                \
1929    __asm vmovdqu    xmm0, [esi]                  /* U */                      \
1930    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
1931    __asm lea        esi,  [esi + 16]                                          \
1932    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1933    __asm vpermq     ymm1, ymm1, 0xd8                                          \
1934    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1935    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1936    __asm vpermq     ymm4, ymm4, 0xd8                                          \
1937    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1938    __asm lea        eax, [eax + 16]                                           \
1939  }
1940
1941// Read 8 UV from 422, upsample to 16 UV.
1942#define READYUV422_AVX2 __asm {                                                \
1943    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
1944    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
1945    __asm lea        esi,  [esi + 8]                                           \
1946    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1947    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1948    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1949    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1950    __asm vpermq     ymm4, ymm4, 0xd8                                          \
1951    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1952    __asm lea        eax, [eax + 16]                                           \
1953  }
1954
1955// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
1956#define READYUVA422_AVX2 __asm {                                               \
1957    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
1958    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
1959    __asm lea        esi,  [esi + 8]                                           \
1960    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1961    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1962    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1963    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1964    __asm vpermq     ymm4, ymm4, 0xd8                                          \
1965    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1966    __asm lea        eax, [eax + 16]                                           \
1967    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
1968    __asm vpermq     ymm5, ymm5, 0xd8                                          \
1969    __asm lea        ebp, [ebp + 16]                                           \
1970  }
1971
1972// Read 4 UV from 411, upsample to 16 UV.
1973#define READYUV411_AVX2 __asm {                                                \
1974    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
1975    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
1976    __asm lea        esi,  [esi + 4]                                           \
1977    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1978    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1979    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1980    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
1981    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1982    __asm vpermq     ymm4, ymm4, 0xd8                                          \
1983    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1984    __asm lea        eax, [eax + 16]                                           \
1985  }
1986
1987// Read 8 UV from NV12, upsample to 16 UV.
1988#define READNV12_AVX2 __asm {                                                  \
1989    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
1990    __asm lea        esi,  [esi + 16]                                          \
1991    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1992    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1993    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1994    __asm vpermq     ymm4, ymm4, 0xd8                                          \
1995    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1996    __asm lea        eax, [eax + 16]                                           \
1997  }
1998
1999// Read 8 UV from NV21, upsample to 16 UV.
2000#define READNV21_AVX2 __asm {                                                  \
2001    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
2002    __asm lea        esi,  [esi + 16]                                          \
2003    __asm vpermq     ymm0, ymm0, 0xd8                                          \
2004    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
2005    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
2006    __asm vpermq     ymm4, ymm4, 0xd8                                          \
2007    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
2008    __asm lea        eax, [eax + 16]                                           \
2009  }
2010
2011// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2012#define READYUY2_AVX2 __asm {                                                  \
2013    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
2014    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
2015    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
2016    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
2017    __asm lea        eax, [eax + 32]                                           \
2018  }
2019
2020// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2021#define READUYVY_AVX2 __asm {                                                  \
2022    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
2023    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
2024    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
2025    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
2026    __asm lea        eax, [eax + 32]                                           \
2027  }
2028
2029// Convert 16 pixels: 16 UV and 16 Y.
2030#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
2031    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
2032    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
2033    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
2034    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
2035    __asm vpsubw     ymm2, ymm3, ymm2                                          \
2036    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
2037    __asm vpsubw     ymm1, ymm3, ymm1                                          \
2038    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
2039    __asm vpsubw     ymm0, ymm3, ymm0                                          \
2040    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
2041    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
2042    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
2043    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
2044    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
2045    __asm vpsraw     ymm0, ymm0, 6                                             \
2046    __asm vpsraw     ymm1, ymm1, 6                                             \
2047    __asm vpsraw     ymm2, ymm2, 6                                             \
2048    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
2049    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
2050    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
2051  }
2052
2053// Store 16 ARGB values.
2054#define STOREARGB_AVX2 __asm {                                                 \
2055    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
2056    __asm vpermq     ymm0, ymm0, 0xd8                                          \
2057    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
2058    __asm vpermq     ymm2, ymm2, 0xd8                                          \
2059    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
2060    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
2061    __asm vmovdqu    0[edx], ymm1                                              \
2062    __asm vmovdqu    32[edx], ymm0                                             \
2063    __asm lea        edx,  [edx + 64]                                          \
2064  }
2065
2066// Store 16 RGBA values.
2067#define STORERGBA_AVX2 __asm {                                                 \
2068    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
2069    __asm vpermq     ymm1, ymm1, 0xd8                                          \
2070    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
2071    __asm vpermq     ymm2, ymm2, 0xd8                                          \
2072    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
2073    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
2074    __asm vmovdqu    [edx], ymm0                                               \
2075    __asm vmovdqu    [edx + 32], ymm1                                          \
2076    __asm lea        edx,  [edx + 64]                                          \
2077  }
2078
2079#ifdef HAS_I422TOARGBROW_AVX2
2080// 16 pixels
2081// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2082__declspec(naked)
2083void I422ToARGBRow_AVX2(const uint8* y_buf,
2084                        const uint8* u_buf,
2085                        const uint8* v_buf,
2086                        uint8* dst_argb,
2087                        const struct YuvConstants* yuvconstants,
2088                        int width) {
2089  __asm {
2090    push       esi
2091    push       edi
2092    push       ebx
2093    mov        eax, [esp + 12 + 4]   // Y
2094    mov        esi, [esp + 12 + 8]   // U
2095    mov        edi, [esp + 12 + 12]  // V
2096    mov        edx, [esp + 12 + 16]  // argb
2097    mov        ebx, [esp + 12 + 20]  // yuvconstants
2098    mov        ecx, [esp + 12 + 24]  // width
2099    sub        edi, esi
2100    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2101
2102 convertloop:
2103    READYUV422_AVX2
2104    YUVTORGB_AVX2(ebx)
2105    STOREARGB_AVX2
2106
2107    sub        ecx, 16
2108    jg         convertloop
2109
2110    pop        ebx
2111    pop        edi
2112    pop        esi
2113    vzeroupper
2114    ret
2115  }
2116}
2117#endif  // HAS_I422TOARGBROW_AVX2
2118
2119#ifdef HAS_I422ALPHATOARGBROW_AVX2
2120// 16 pixels
2121// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2122__declspec(naked)
2123void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2124                             const uint8* u_buf,
2125                             const uint8* v_buf,
2126                             const uint8* a_buf,
2127                             uint8* dst_argb,
2128                             const struct YuvConstants* yuvconstants,
2129                             int width) {
2130  __asm {
2131    push       esi
2132    push       edi
2133    push       ebx
2134    push       ebp
2135    mov        eax, [esp + 16 + 4]   // Y
2136    mov        esi, [esp + 16 + 8]   // U
2137    mov        edi, [esp + 16 + 12]  // V
2138    mov        ebp, [esp + 16 + 16]  // A
2139    mov        edx, [esp + 16 + 20]  // argb
2140    mov        ebx, [esp + 16 + 24]  // yuvconstants
2141    mov        ecx, [esp + 16 + 28]  // width
2142    sub        edi, esi
2143
2144 convertloop:
2145    READYUVA422_AVX2
2146    YUVTORGB_AVX2(ebx)
2147    STOREARGB_AVX2
2148
2149    sub        ecx, 16
2150    jg         convertloop
2151
2152    pop        ebp
2153    pop        ebx
2154    pop        edi
2155    pop        esi
2156    vzeroupper
2157    ret
2158  }
2159}
2160#endif  // HAS_I422ALPHATOARGBROW_AVX2
2161
2162#ifdef HAS_I444TOARGBROW_AVX2
2163// 16 pixels
2164// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2165__declspec(naked)
2166void I444ToARGBRow_AVX2(const uint8* y_buf,
2167                        const uint8* u_buf,
2168                        const uint8* v_buf,
2169                        uint8* dst_argb,
2170                        const struct YuvConstants* yuvconstants,
2171                        int width) {
2172  __asm {
2173    push       esi
2174    push       edi
2175    push       ebx
2176    mov        eax, [esp + 12 + 4]   // Y
2177    mov        esi, [esp + 12 + 8]   // U
2178    mov        edi, [esp + 12 + 12]  // V
2179    mov        edx, [esp + 12 + 16]  // argb
2180    mov        ebx, [esp + 12 + 20]  // yuvconstants
2181    mov        ecx, [esp + 12 + 24]  // width
2182    sub        edi, esi
2183    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2184 convertloop:
2185    READYUV444_AVX2
2186    YUVTORGB_AVX2(ebx)
2187    STOREARGB_AVX2
2188
2189    sub        ecx, 16
2190    jg         convertloop
2191
2192    pop        ebx
2193    pop        edi
2194    pop        esi
2195    vzeroupper
2196    ret
2197  }
2198}
2199#endif  // HAS_I444TOARGBROW_AVX2
2200
2201#ifdef HAS_I411TOARGBROW_AVX2
2202// 16 pixels
2203// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2204__declspec(naked)
2205void I411ToARGBRow_AVX2(const uint8* y_buf,
2206                        const uint8* u_buf,
2207                        const uint8* v_buf,
2208                        uint8* dst_argb,
2209                        const struct YuvConstants* yuvconstants,
2210                        int width) {
2211  __asm {
2212    push       esi
2213    push       edi
2214    push       ebx
2215    mov        eax, [esp + 12 + 4]   // Y
2216    mov        esi, [esp + 12 + 8]   // U
2217    mov        edi, [esp + 12 + 12]  // V
2218    mov        edx, [esp + 12 + 16]  // abgr
2219    mov        ebx, [esp + 12 + 20]  // yuvconstants
2220    mov        ecx, [esp + 12 + 24]  // width
2221    sub        edi, esi
2222    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2223
2224 convertloop:
2225    READYUV411_AVX2
2226    YUVTORGB_AVX2(ebx)
2227    STOREARGB_AVX2
2228
2229    sub        ecx, 16
2230    jg         convertloop
2231
2232    pop        ebx
2233    pop        edi
2234    pop        esi
2235    vzeroupper
2236    ret
2237  }
2238}
2239#endif  // HAS_I411TOARGBROW_AVX2
2240
2241#ifdef HAS_NV12TOARGBROW_AVX2
2242// 16 pixels.
2243// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2244__declspec(naked)
2245void NV12ToARGBRow_AVX2(const uint8* y_buf,
2246                        const uint8* uv_buf,
2247                        uint8* dst_argb,
2248                        const struct YuvConstants* yuvconstants,
2249                        int width) {
2250  __asm {
2251    push       esi
2252    push       ebx
2253    mov        eax, [esp + 8 + 4]   // Y
2254    mov        esi, [esp + 8 + 8]   // UV
2255    mov        edx, [esp + 8 + 12]  // argb
2256    mov        ebx, [esp + 8 + 16]  // yuvconstants
2257    mov        ecx, [esp + 8 + 20]  // width
2258    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2259
2260 convertloop:
2261    READNV12_AVX2
2262    YUVTORGB_AVX2(ebx)
2263    STOREARGB_AVX2
2264
2265    sub        ecx, 16
2266    jg         convertloop
2267
2268    pop        ebx
2269    pop        esi
2270    vzeroupper
2271    ret
2272  }
2273}
2274#endif  // HAS_NV12TOARGBROW_AVX2
2275
2276#ifdef HAS_NV21TOARGBROW_AVX2
2277// 16 pixels.
2278// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2279__declspec(naked)
2280void NV21ToARGBRow_AVX2(const uint8* y_buf,
2281                        const uint8* vu_buf,
2282                        uint8* dst_argb,
2283                        const struct YuvConstants* yuvconstants,
2284                        int width) {
2285  __asm {
2286    push       esi
2287    push       ebx
2288    mov        eax, [esp + 8 + 4]   // Y
2289    mov        esi, [esp + 8 + 8]   // VU
2290    mov        edx, [esp + 8 + 12]  // argb
2291    mov        ebx, [esp + 8 + 16]  // yuvconstants
2292    mov        ecx, [esp + 8 + 20]  // width
2293    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2294
2295 convertloop:
2296    READNV21_AVX2
2297    YUVTORGB_AVX2(ebx)
2298    STOREARGB_AVX2
2299
2300    sub        ecx, 16
2301    jg         convertloop
2302
2303    pop        ebx
2304    pop        esi
2305    vzeroupper
2306    ret
2307  }
2308}
2309#endif  // HAS_NV21TOARGBROW_AVX2
2310
2311#ifdef HAS_YUY2TOARGBROW_AVX2
2312// 16 pixels.
2313// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2314__declspec(naked)
2315void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
2316                        uint8* dst_argb,
2317                        const struct YuvConstants* yuvconstants,
2318                        int width) {
2319  __asm {
2320    push       ebx
2321    mov        eax, [esp + 4 + 4]   // yuy2
2322    mov        edx, [esp + 4 + 8]   // argb
2323    mov        ebx, [esp + 4 + 12]  // yuvconstants
2324    mov        ecx, [esp + 4 + 16]  // width
2325    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2326
2327 convertloop:
2328    READYUY2_AVX2
2329    YUVTORGB_AVX2(ebx)
2330    STOREARGB_AVX2
2331
2332    sub        ecx, 16
2333    jg         convertloop
2334
2335    pop        ebx
2336    vzeroupper
2337    ret
2338  }
2339}
2340#endif  // HAS_YUY2TOARGBROW_AVX2
2341
2342#ifdef HAS_UYVYTOARGBROW_AVX2
2343// 16 pixels.
2344// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2345__declspec(naked)
2346void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
2347                        uint8* dst_argb,
2348                        const struct YuvConstants* yuvconstants,
2349                        int width) {
2350  __asm {
2351    push       ebx
2352    mov        eax, [esp + 4 + 4]   // uyvy
2353    mov        edx, [esp + 4 + 8]   // argb
2354    mov        ebx, [esp + 4 + 12]  // yuvconstants
2355    mov        ecx, [esp + 4 + 16]  // width
2356    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2357
2358 convertloop:
2359    READUYVY_AVX2
2360    YUVTORGB_AVX2(ebx)
2361    STOREARGB_AVX2
2362
2363    sub        ecx, 16
2364    jg         convertloop
2365
2366    pop        ebx
2367    vzeroupper
2368    ret
2369  }
2370}
2371#endif  // HAS_UYVYTOARGBROW_AVX2
2372
2373#ifdef HAS_I422TORGBAROW_AVX2
2374// 16 pixels
2375// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2376__declspec(naked)
2377void I422ToRGBARow_AVX2(const uint8* y_buf,
2378                        const uint8* u_buf,
2379                        const uint8* v_buf,
2380                        uint8* dst_argb,
2381                        const struct YuvConstants* yuvconstants,
2382                        int width) {
2383  __asm {
2384    push       esi
2385    push       edi
2386    push       ebx
2387    mov        eax, [esp + 12 + 4]   // Y
2388    mov        esi, [esp + 12 + 8]   // U
2389    mov        edi, [esp + 12 + 12]  // V
2390    mov        edx, [esp + 12 + 16]  // abgr
2391    mov        ebx, [esp + 12 + 20]  // yuvconstants
2392    mov        ecx, [esp + 12 + 24]  // width
2393    sub        edi, esi
2394    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2395
2396 convertloop:
2397    READYUV422_AVX2
2398    YUVTORGB_AVX2(ebx)
2399    STORERGBA_AVX2
2400
2401    sub        ecx, 16
2402    jg         convertloop
2403
2404    pop        ebx
2405    pop        edi
2406    pop        esi
2407    vzeroupper
2408    ret
2409  }
2410}
2411#endif  // HAS_I422TORGBAROW_AVX2
2412
2413#if defined(HAS_I422TOARGBROW_SSSE3)
2414// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2415// Allows a conversion with half size scaling.
2416
2417// Read 8 UV from 444.
2418#define READYUV444 __asm {                                                     \
2419    __asm movq       xmm0, qword ptr [esi] /* U */                             \
2420    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
2421    __asm lea        esi,  [esi + 8]                                           \
2422    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2423    __asm movq       xmm4, qword ptr [eax]                                     \
2424    __asm punpcklbw  xmm4, xmm4                                                \
2425    __asm lea        eax, [eax + 8]                                            \
2426  }
2427
2428// Read 4 UV from 422, upsample to 8 UV.
2429#define READYUV422 __asm {                                                     \
2430    __asm movd       xmm0, [esi]          /* U */                              \
2431    __asm movd       xmm1, [esi + edi]    /* V */                              \
2432    __asm lea        esi,  [esi + 4]                                           \
2433    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2434    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2435    __asm movq       xmm4, qword ptr [eax]                                     \
2436    __asm punpcklbw  xmm4, xmm4                                                \
2437    __asm lea        eax, [eax + 8]                                            \
2438  }
2439
2440// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
2441#define READYUVA422 __asm {                                                    \
2442    __asm movd       xmm0, [esi]          /* U */                              \
2443    __asm movd       xmm1, [esi + edi]    /* V */                              \
2444    __asm lea        esi,  [esi + 4]                                           \
2445    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2446    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2447    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
2448    __asm punpcklbw  xmm4, xmm4                                                \
2449    __asm lea        eax, [eax + 8]                                            \
2450    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
2451    __asm lea        ebp, [ebp + 8]                                            \
2452  }
2453
2454// Read 2 UV from 411, upsample to 8 UV.
2455// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
2456//  __asm pinsrw     xmm0, [esi], 0        /* U */
2457//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
2458#define READYUV411_EBX __asm {                                                 \
2459    __asm movzx      ebx, word ptr [esi]        /* U */                        \
2460    __asm movd       xmm0, ebx                                                 \
2461    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
2462    __asm movd       xmm1, ebx                                                 \
2463    __asm lea        esi,  [esi + 2]                                           \
2464    __asm punpcklbw  xmm0, xmm1            /* UV */                            \
2465    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
2466    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
2467    __asm movq       xmm4, qword ptr [eax]                                     \
2468    __asm punpcklbw  xmm4, xmm4                                                \
2469    __asm lea        eax, [eax + 8]                                            \
2470  }
2471
2472// Read 4 UV from NV12, upsample to 8 UV.
2473#define READNV12 __asm {                                                       \
2474    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2475    __asm lea        esi,  [esi + 8]                                           \
2476    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2477    __asm movq       xmm4, qword ptr [eax]                                     \
2478    __asm punpcklbw  xmm4, xmm4                                                \
2479    __asm lea        eax, [eax + 8]                                            \
2480  }
2481
2482// Read 4 VU from NV21, upsample to 8 UV.
2483#define READNV21 __asm {                                                       \
2484    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2485    __asm lea        esi,  [esi + 8]                                           \
2486    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
2487    __asm movq       xmm4, qword ptr [eax]                                     \
2488    __asm punpcklbw  xmm4, xmm4                                                \
2489    __asm lea        eax, [eax + 8]                                            \
2490  }
2491
2492// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2493#define READYUY2 __asm {                                                       \
2494    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
2495    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
2496    __asm movdqu     xmm0, [eax]          /* UV */                             \
2497    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
2498    __asm lea        eax, [eax + 16]                                           \
2499  }
2500
2501// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2502#define READUYVY __asm {                                                       \
2503    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
2504    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
2505    __asm movdqu     xmm0, [eax]          /* UV */                             \
2506    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
2507    __asm lea        eax, [eax + 16]                                           \
2508  }
2509
2510// Convert 8 pixels: 8 UV and 8 Y.
2511#define YUVTORGB(YuvConstants) __asm {                                         \
2512    __asm movdqa     xmm1, xmm0                                                \
2513    __asm movdqa     xmm2, xmm0                                                \
2514    __asm movdqa     xmm3, xmm0                                                \
2515    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
2516    __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
2517    __asm psubw      xmm0, xmm1                                                \
2518    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
2519    __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
2520    __asm psubw      xmm1, xmm2                                                \
2521    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
2522    __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
2523    __asm psubw      xmm2, xmm3                                                \
2524    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
2525    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
2526    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
2527    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
2528    __asm psraw      xmm0, 6                                                   \
2529    __asm psraw      xmm1, 6                                                   \
2530    __asm psraw      xmm2, 6                                                   \
2531    __asm packuswb   xmm0, xmm0           /* B */                              \
2532    __asm packuswb   xmm1, xmm1           /* G */                              \
2533    __asm packuswb   xmm2, xmm2           /* R */                              \
2534  }
2535
2536// Store 8 ARGB values.
2537#define STOREARGB __asm {                                                      \
2538    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2539    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
2540    __asm movdqa     xmm1, xmm0                                                \
2541    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
2542    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
2543    __asm movdqu     0[edx], xmm0                                              \
2544    __asm movdqu     16[edx], xmm1                                             \
2545    __asm lea        edx,  [edx + 32]                                          \
2546  }
2547
2548// Store 8 BGRA values.
2549#define STOREBGRA __asm {                                                      \
2550    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
2551    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
2552    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
2553    __asm movdqa     xmm0, xmm5                                                \
2554    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
2555    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
2556    __asm movdqu     0[edx], xmm5                                              \
2557    __asm movdqu     16[edx], xmm0                                             \
2558    __asm lea        edx,  [edx + 32]                                          \
2559  }
2560
2561// Store 8 RGBA values.
2562#define STORERGBA __asm {                                                      \
2563    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
2564    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
2565    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
2566    __asm movdqa     xmm0, xmm5                                                \
2567    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
2568    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
2569    __asm movdqu     0[edx], xmm5                                              \
2570    __asm movdqu     16[edx], xmm0                                             \
2571    __asm lea        edx,  [edx + 32]                                          \
2572  }
2573
2574// Store 8 RGB24 values.
2575#define STORERGB24 __asm {                                                     \
2576    /* Weave into RRGB */                                                      \
2577    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2578    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
2579    __asm movdqa     xmm1, xmm0                                                \
2580    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
2581    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
2582    /* RRGB -> RGB24 */                                                        \
2583    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
2584    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
2585    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
2586    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
2587    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
2588    __asm lea        edx,  [edx + 24]                                          \
2589  }
2590
2591// Store 8 RGB565 values.
2592#define STORERGB565 __asm {                                                    \
2593    /* Weave into RRGB */                                                      \
2594    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2595    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
2596    __asm movdqa     xmm1, xmm0                                                \
2597    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
2598    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
2599    /* RRGB -> RGB565 */                                                       \
2600    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
2601    __asm movdqa     xmm2, xmm0    /* G */                                     \
2602    __asm pslld      xmm0, 8       /* R */                                     \
2603    __asm psrld      xmm3, 3       /* B */                                     \
2604    __asm psrld      xmm2, 5       /* G */                                     \
2605    __asm psrad      xmm0, 16      /* R */                                     \
2606    __asm pand       xmm3, xmm5    /* B */                                     \
2607    __asm pand       xmm2, xmm6    /* G */                                     \
2608    __asm pand       xmm0, xmm7    /* R */                                     \
2609    __asm por        xmm3, xmm2    /* BG */                                    \
2610    __asm por        xmm0, xmm3    /* BGR */                                   \
2611    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
2612    __asm movdqa     xmm2, xmm1    /* G */                                     \
2613    __asm pslld      xmm1, 8       /* R */                                     \
2614    __asm psrld      xmm3, 3       /* B */                                     \
2615    __asm psrld      xmm2, 5       /* G */                                     \
2616    __asm psrad      xmm1, 16      /* R */                                     \
2617    __asm pand       xmm3, xmm5    /* B */                                     \
2618    __asm pand       xmm2, xmm6    /* G */                                     \
2619    __asm pand       xmm1, xmm7    /* R */                                     \
2620    __asm por        xmm3, xmm2    /* BG */                                    \
2621    __asm por        xmm1, xmm3    /* BGR */                                   \
2622    __asm packssdw   xmm0, xmm1                                                \
2623    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
2624    __asm lea        edx, [edx + 16]                                           \
2625  }
2626
2627// 8 pixels.
2628// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2629__declspec(naked)
2630void I444ToARGBRow_SSSE3(const uint8* y_buf,
2631                         const uint8* u_buf,
2632                         const uint8* v_buf,
2633                         uint8* dst_argb,
2634                         const struct YuvConstants* yuvconstants,
2635                         int width) {
2636  __asm {
2637    push       esi
2638    push       edi
2639    push       ebx
2640    mov        eax, [esp + 12 + 4]   // Y
2641    mov        esi, [esp + 12 + 8]   // U
2642    mov        edi, [esp + 12 + 12]  // V
2643    mov        edx, [esp + 12 + 16]  // argb
2644    mov        ebx, [esp + 12 + 20]  // yuvconstants
2645    mov        ecx, [esp + 12 + 24]  // width
2646    sub        edi, esi
2647    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
2648
2649 convertloop:
2650    READYUV444
2651    YUVTORGB(ebx)
2652    STOREARGB
2653
2654    sub        ecx, 8
2655    jg         convertloop
2656
2657    pop        ebx
2658    pop        edi
2659    pop        esi
2660    ret
2661  }
2662}
2663
2664// 8 pixels.
2665// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2666__declspec(naked)
2667void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2668                          const uint8* u_buf,
2669                          const uint8* v_buf,
2670                          uint8* dst_rgb24,
2671                          const struct YuvConstants* yuvconstants,
2672                          int width) {
2673  __asm {
2674    push       esi
2675    push       edi
2676    push       ebx
2677    mov        eax, [esp + 12 + 4]   // Y
2678    mov        esi, [esp + 12 + 8]   // U
2679    mov        edi, [esp + 12 + 12]  // V
2680    mov        edx, [esp + 12 + 16]  // argb
2681    mov        ebx, [esp + 12 + 20]  // yuvconstants
2682    mov        ecx, [esp + 12 + 24]  // width
2683    sub        edi, esi
2684    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2685    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2686
2687 convertloop:
2688    READYUV422
2689    YUVTORGB(ebx)
2690    STORERGB24
2691
2692    sub        ecx, 8
2693    jg         convertloop
2694
2695    pop        ebx
2696    pop        edi
2697    pop        esi
2698    ret
2699  }
2700}
2701
2702// 8 pixels
2703// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2704__declspec(naked)
2705void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2706                           const uint8* u_buf,
2707                           const uint8* v_buf,
2708                           uint8* rgb565_buf,
2709                           const struct YuvConstants* yuvconstants,
2710                           int width) {
2711  __asm {
2712    push       esi
2713    push       edi
2714    push       ebx
2715    mov        eax, [esp + 12 + 4]   // Y
2716    mov        esi, [esp + 12 + 8]   // U
2717    mov        edi, [esp + 12 + 12]  // V
2718    mov        edx, [esp + 12 + 16]  // argb
2719    mov        ebx, [esp + 12 + 20]  // yuvconstants
2720    mov        ecx, [esp + 12 + 24]  // width
2721    sub        edi, esi
2722    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
2723    psrld      xmm5, 27
2724    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
2725    psrld      xmm6, 26
2726    pslld      xmm6, 5
2727    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
2728    pslld      xmm7, 11
2729
2730 convertloop:
2731    READYUV422
2732    YUVTORGB(ebx)
2733    STORERGB565
2734
2735    sub        ecx, 8
2736    jg         convertloop
2737
2738    pop        ebx
2739    pop        edi
2740    pop        esi
2741    ret
2742  }
2743}
2744
2745// 8 pixels.
2746// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2747__declspec(naked)
2748void I422ToARGBRow_SSSE3(const uint8* y_buf,
2749                         const uint8* u_buf,
2750                         const uint8* v_buf,
2751                         uint8* dst_argb,
2752                         const struct YuvConstants* yuvconstants,
2753                         int width) {
2754  __asm {
2755    push       esi
2756    push       edi
2757    push       ebx
2758    mov        eax, [esp + 12 + 4]   // Y
2759    mov        esi, [esp + 12 + 8]   // U
2760    mov        edi, [esp + 12 + 12]  // V
2761    mov        edx, [esp + 12 + 16]  // argb
2762    mov        ebx, [esp + 12 + 20]  // yuvconstants
2763    mov        ecx, [esp + 12 + 24]  // width
2764    sub        edi, esi
2765    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2766
2767 convertloop:
2768    READYUV422
2769    YUVTORGB(ebx)
2770    STOREARGB
2771
2772    sub        ecx, 8
2773    jg         convertloop
2774
2775    pop        ebx
2776    pop        edi
2777    pop        esi
2778    ret
2779  }
2780}
2781
2782// 8 pixels.
2783// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2784__declspec(naked)
2785void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
2786                              const uint8* u_buf,
2787                              const uint8* v_buf,
2788                              const uint8* a_buf,
2789                              uint8* dst_argb,
2790                              const struct YuvConstants* yuvconstants,
2791                              int width) {
2792  __asm {
2793    push       esi
2794    push       edi
2795    push       ebx
2796    push       ebp
2797    mov        eax, [esp + 16 + 4]   // Y
2798    mov        esi, [esp + 16 + 8]   // U
2799    mov        edi, [esp + 16 + 12]  // V
2800    mov        ebp, [esp + 16 + 16]  // A
2801    mov        edx, [esp + 16 + 20]  // argb
2802    mov        ebx, [esp + 16 + 24]  // yuvconstants
2803    mov        ecx, [esp + 16 + 28]  // width
2804    sub        edi, esi
2805
2806 convertloop:
2807    READYUVA422
2808    YUVTORGB(ebx)
2809    STOREARGB
2810
2811    sub        ecx, 8
2812    jg         convertloop
2813
2814    pop        ebp
2815    pop        ebx
2816    pop        edi
2817    pop        esi
2818    ret
2819  }
2820}
2821
2822// 8 pixels.
2823// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2824// Similar to I420 but duplicate UV once more.
2825__declspec(naked)
2826void I411ToARGBRow_SSSE3(const uint8* y_buf,
2827                         const uint8* u_buf,
2828                         const uint8* v_buf,
2829                         uint8* dst_argb,
2830                         const struct YuvConstants* yuvconstants,
2831                         int width) {
2832  __asm {
2833    push       esi
2834    push       edi
2835    push       ebx
2836    push       ebp
2837    mov        eax, [esp + 16 + 4]   // Y
2838    mov        esi, [esp + 16 + 8]   // U
2839    mov        edi, [esp + 16 + 12]  // V
2840    mov        edx, [esp + 16 + 16]  // abgr
2841    mov        ebp, [esp + 16 + 20]  // yuvconstants
2842    mov        ecx, [esp + 16 + 24]  // width
2843    sub        edi, esi
2844    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
2845
2846 convertloop:
2847    READYUV411_EBX
2848    YUVTORGB(ebp)
2849    STOREARGB
2850
2851    sub        ecx, 8
2852    jg         convertloop
2853
2854    pop        ebp
2855    pop        ebx
2856    pop        edi
2857    pop        esi
2858    ret
2859  }
2860}
2861
2862// 8 pixels.
2863// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2864__declspec(naked)
2865void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2866                         const uint8* uv_buf,
2867                         uint8* dst_argb,
2868                         const struct YuvConstants* yuvconstants,
2869                         int width) {
2870  __asm {
2871    push       esi
2872    push       ebx
2873    mov        eax, [esp + 8 + 4]   // Y
2874    mov        esi, [esp + 8 + 8]   // UV
2875    mov        edx, [esp + 8 + 12]  // argb
2876    mov        ebx, [esp + 8 + 16]  // yuvconstants
2877    mov        ecx, [esp + 8 + 20]  // width
2878    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2879
2880 convertloop:
2881    READNV12
2882    YUVTORGB(ebx)
2883    STOREARGB
2884
2885    sub        ecx, 8
2886    jg         convertloop
2887
2888    pop        ebx
2889    pop        esi
2890    ret
2891  }
2892}
2893
2894// 8 pixels.
2895// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2896__declspec(naked)
2897void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2898                         const uint8* vu_buf,
2899                         uint8* dst_argb,
2900                         const struct YuvConstants* yuvconstants,
2901                         int width) {
2902  __asm {
2903    push       esi
2904    push       ebx
2905    mov        eax, [esp + 8 + 4]   // Y
2906    mov        esi, [esp + 8 + 8]   // VU
2907    mov        edx, [esp + 8 + 12]  // argb
2908    mov        ebx, [esp + 8 + 16]  // yuvconstants
2909    mov        ecx, [esp + 8 + 20]  // width
2910    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2911
2912 convertloop:
2913    READNV21
2914    YUVTORGB(ebx)
2915    STOREARGB
2916
2917    sub        ecx, 8
2918    jg         convertloop
2919
2920    pop        ebx
2921    pop        esi
2922    ret
2923  }
2924}
2925
2926// 8 pixels.
2927// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2928__declspec(naked)
2929void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
2930                         uint8* dst_argb,
2931                         const struct YuvConstants* yuvconstants,
2932                         int width) {
2933  __asm {
2934    push       ebx
2935    mov        eax, [esp + 4 + 4]   // yuy2
2936    mov        edx, [esp + 4 + 8]   // argb
2937    mov        ebx, [esp + 4 + 12]  // yuvconstants
2938    mov        ecx, [esp + 4 + 16]  // width
2939    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2940
2941 convertloop:
2942    READYUY2
2943    YUVTORGB(ebx)
2944    STOREARGB
2945
2946    sub        ecx, 8
2947    jg         convertloop
2948
2949    pop        ebx
2950    ret
2951  }
2952}
2953
2954// 8 pixels.
2955// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2956__declspec(naked)
2957void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
2958                         uint8* dst_argb,
2959                         const struct YuvConstants* yuvconstants,
2960                         int width) {
2961  __asm {
2962    push       ebx
2963    mov        eax, [esp + 4 + 4]   // uyvy
2964    mov        edx, [esp + 4 + 8]   // argb
2965    mov        ebx, [esp + 4 + 12]  // yuvconstants
2966    mov        ecx, [esp + 4 + 16]  // width
2967    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2968
2969 convertloop:
2970    READUYVY
2971    YUVTORGB(ebx)
2972    STOREARGB
2973
2974    sub        ecx, 8
2975    jg         convertloop
2976
2977    pop        ebx
2978    ret
2979  }
2980}
2981
2982__declspec(naked)
2983void I422ToRGBARow_SSSE3(const uint8* y_buf,
2984                         const uint8* u_buf,
2985                         const uint8* v_buf,
2986                         uint8* dst_rgba,
2987                         const struct YuvConstants* yuvconstants,
2988                         int width) {
2989  __asm {
2990    push       esi
2991    push       edi
2992    push       ebx
2993    mov        eax, [esp + 12 + 4]   // Y
2994    mov        esi, [esp + 12 + 8]   // U
2995    mov        edi, [esp + 12 + 12]  // V
2996    mov        edx, [esp + 12 + 16]  // argb
2997    mov        ebx, [esp + 12 + 20]  // yuvconstants
2998    mov        ecx, [esp + 12 + 24]  // width
2999    sub        edi, esi
3000
3001 convertloop:
3002    READYUV422
3003    YUVTORGB(ebx)
3004    STORERGBA
3005
3006    sub        ecx, 8
3007    jg         convertloop
3008
3009    pop        ebx
3010    pop        edi
3011    pop        esi
3012    ret
3013  }
3014}
3015#endif  // HAS_I422TOARGBROW_SSSE3
3016
3017#ifdef HAS_I400TOARGBROW_SSE2
3018// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
3019__declspec(naked)
3020void I400ToARGBRow_SSE2(const uint8* y_buf,
3021                        uint8* rgb_buf,
3022                        int width) {
3023  __asm {
3024    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
3025    movd       xmm2, eax
3026    pshufd     xmm2, xmm2,0
3027    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
3028    movd       xmm3, eax
3029    pshufd     xmm3, xmm3, 0
3030    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
3031    pslld      xmm4, 24
3032
3033    mov        eax, [esp + 4]       // Y
3034    mov        edx, [esp + 8]       // rgb
3035    mov        ecx, [esp + 12]      // width
3036
3037 convertloop:
3038    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3039    movq       xmm0, qword ptr [eax]
3040    lea        eax, [eax + 8]
3041    punpcklbw  xmm0, xmm0           // Y.Y
3042    pmulhuw    xmm0, xmm2
3043    psubusw    xmm0, xmm3
3044    psrlw      xmm0, 6
3045    packuswb   xmm0, xmm0           // G
3046
3047    // Step 2: Weave into ARGB
3048    punpcklbw  xmm0, xmm0           // GG
3049    movdqa     xmm1, xmm0
3050    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
3051    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
3052    por        xmm0, xmm4
3053    por        xmm1, xmm4
3054    movdqu     [edx], xmm0
3055    movdqu     [edx + 16], xmm1
3056    lea        edx,  [edx + 32]
3057    sub        ecx, 8
3058    jg         convertloop
3059    ret
3060  }
3061}
3062#endif  // HAS_I400TOARGBROW_SSE2
3063
3064#ifdef HAS_I400TOARGBROW_AVX2
3065// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
3066// note: vpunpcklbw mutates and vpackuswb unmutates.
3067__declspec(naked)
3068void I400ToARGBRow_AVX2(const uint8* y_buf,
3069                        uint8* rgb_buf,
3070                        int width) {
3071  __asm {
3072    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
3073    vmovd      xmm2, eax
3074    vbroadcastss ymm2, xmm2
3075    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
3076    vmovd      xmm3, eax
3077    vbroadcastss ymm3, xmm3
3078    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
3079    vpslld     ymm4, ymm4, 24
3080
3081    mov        eax, [esp + 4]       // Y
3082    mov        edx, [esp + 8]       // rgb
3083    mov        ecx, [esp + 12]      // width
3084
3085 convertloop:
3086    // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
3087    vmovdqu    xmm0, [eax]
3088    lea        eax, [eax + 16]
3089    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
3090    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
3091    vpmulhuw   ymm0, ymm0, ymm2
3092    vpsubusw   ymm0, ymm0, ymm3
3093    vpsrlw     ymm0, ymm0, 6
3094    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
3095
3096    // TODO(fbarchard): Weave alpha with unpack.
3097    // Step 2: Weave into ARGB
3098    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
3099    vpermq     ymm1, ymm1, 0xd8
3100    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
3101    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
3102    vpor       ymm0, ymm0, ymm4
3103    vpor       ymm1, ymm1, ymm4
3104    vmovdqu    [edx], ymm0
3105    vmovdqu    [edx + 32], ymm1
3106    lea        edx,  [edx + 64]
3107    sub        ecx, 16
3108    jg         convertloop
3109    vzeroupper
3110    ret
3111  }
3112}
3113#endif  // HAS_I400TOARGBROW_AVX2
3114
3115#ifdef HAS_MIRRORROW_SSSE3
3116// Shuffle table for reversing the bytes.
3117static const uvec8 kShuffleMirror = {
3118  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3119};
3120
3121// TODO(fbarchard): Replace lea with -16 offset.
3122__declspec(naked)
3123void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3124  __asm {
3125    mov       eax, [esp + 4]   // src
3126    mov       edx, [esp + 8]   // dst
3127    mov       ecx, [esp + 12]  // width
3128    movdqa    xmm5, xmmword ptr kShuffleMirror
3129
3130 convertloop:
3131    movdqu    xmm0, [eax - 16 + ecx]
3132    pshufb    xmm0, xmm5
3133    movdqu    [edx], xmm0
3134    lea       edx, [edx + 16]
3135    sub       ecx, 16
3136    jg        convertloop
3137    ret
3138  }
3139}
3140#endif  // HAS_MIRRORROW_SSSE3
3141
3142#ifdef HAS_MIRRORROW_AVX2
3143__declspec(naked)
3144void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3145  __asm {
3146    mov       eax, [esp + 4]   // src
3147    mov       edx, [esp + 8]   // dst
3148    mov       ecx, [esp + 12]  // width
3149    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3150
3151 convertloop:
3152    vmovdqu   ymm0, [eax - 32 + ecx]
3153    vpshufb   ymm0, ymm0, ymm5
3154    vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
3155    vmovdqu   [edx], ymm0
3156    lea       edx, [edx + 32]
3157    sub       ecx, 32
3158    jg        convertloop
3159    vzeroupper
3160    ret
3161  }
3162}
3163#endif  // HAS_MIRRORROW_AVX2
3164
3165#ifdef HAS_MIRRORUVROW_SSSE3
3166// Shuffle table for reversing the bytes of UV channels.
3167static const uvec8 kShuffleMirrorUV = {
3168  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3169};
3170
3171__declspec(naked)
3172void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3173                       int width) {
3174  __asm {
3175    push      edi
3176    mov       eax, [esp + 4 + 4]   // src
3177    mov       edx, [esp + 4 + 8]   // dst_u
3178    mov       edi, [esp + 4 + 12]  // dst_v
3179    mov       ecx, [esp + 4 + 16]  // width
3180    movdqa    xmm1, xmmword ptr kShuffleMirrorUV
3181    lea       eax, [eax + ecx * 2 - 16]
3182    sub       edi, edx
3183
3184 convertloop:
3185    movdqu    xmm0, [eax]
3186    lea       eax, [eax - 16]
3187    pshufb    xmm0, xmm1
3188    movlpd    qword ptr [edx], xmm0
3189    movhpd    qword ptr [edx + edi], xmm0
3190    lea       edx, [edx + 8]
3191    sub       ecx, 8
3192    jg        convertloop
3193
3194    pop       edi
3195    ret
3196  }
3197}
3198#endif  // HAS_MIRRORUVROW_SSSE3
3199
3200#ifdef HAS_ARGBMIRRORROW_SSE2
3201__declspec(naked)
3202void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3203  __asm {
3204    mov       eax, [esp + 4]   // src
3205    mov       edx, [esp + 8]   // dst
3206    mov       ecx, [esp + 12]  // width
3207    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3208
3209 convertloop:
3210    movdqu    xmm0, [eax]
3211    lea       eax, [eax - 16]
3212    pshufd    xmm0, xmm0, 0x1b
3213    movdqu    [edx], xmm0
3214    lea       edx, [edx + 16]
3215    sub       ecx, 4
3216    jg        convertloop
3217    ret
3218  }
3219}
3220#endif  // HAS_ARGBMIRRORROW_SSE2
3221
3222#ifdef HAS_ARGBMIRRORROW_AVX2
3223// Shuffle table for reversing the bytes.
3224static const ulvec32 kARGBShuffleMirror_AVX2 = {
3225  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3226};
3227
3228__declspec(naked)
3229void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3230  __asm {
3231    mov       eax, [esp + 4]   // src
3232    mov       edx, [esp + 8]   // dst
3233    mov       ecx, [esp + 12]  // width
3234    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3235
3236 convertloop:
3237    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3238    vmovdqu   [edx], ymm0
3239    lea       edx, [edx + 32]
3240    sub       ecx, 8
3241    jg        convertloop
3242    vzeroupper
3243    ret
3244  }
3245}
3246#endif  // HAS_ARGBMIRRORROW_AVX2
3247
3248#ifdef HAS_SPLITUVROW_SSE2
3249__declspec(naked)
3250void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3251                     int width) {
3252  __asm {
3253    push       edi
3254    mov        eax, [esp + 4 + 4]    // src_uv
3255    mov        edx, [esp + 4 + 8]    // dst_u
3256    mov        edi, [esp + 4 + 12]   // dst_v
3257    mov        ecx, [esp + 4 + 16]   // width
3258    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3259    psrlw      xmm5, 8
3260    sub        edi, edx
3261
3262  convertloop:
3263    movdqu     xmm0, [eax]
3264    movdqu     xmm1, [eax + 16]
3265    lea        eax,  [eax + 32]
3266    movdqa     xmm2, xmm0
3267    movdqa     xmm3, xmm1
3268    pand       xmm0, xmm5   // even bytes
3269    pand       xmm1, xmm5
3270    packuswb   xmm0, xmm1
3271    psrlw      xmm2, 8      // odd bytes
3272    psrlw      xmm3, 8
3273    packuswb   xmm2, xmm3
3274    movdqu     [edx], xmm0
3275    movdqu     [edx + edi], xmm2
3276    lea        edx, [edx + 16]
3277    sub        ecx, 16
3278    jg         convertloop
3279
3280    pop        edi
3281    ret
3282  }
3283}
3284
3285#endif  // HAS_SPLITUVROW_SSE2
3286
3287#ifdef HAS_SPLITUVROW_AVX2
3288__declspec(naked)
3289void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3290                     int width) {
3291  __asm {
3292    push       edi
3293    mov        eax, [esp + 4 + 4]    // src_uv
3294    mov        edx, [esp + 4 + 8]    // dst_u
3295    mov        edi, [esp + 4 + 12]   // dst_v
3296    mov        ecx, [esp + 4 + 16]   // width
3297    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3298    vpsrlw     ymm5, ymm5, 8
3299    sub        edi, edx
3300
3301  convertloop:
3302    vmovdqu    ymm0, [eax]
3303    vmovdqu    ymm1, [eax + 32]
3304    lea        eax,  [eax + 64]
3305    vpsrlw     ymm2, ymm0, 8      // odd bytes
3306    vpsrlw     ymm3, ymm1, 8
3307    vpand      ymm0, ymm0, ymm5   // even bytes
3308    vpand      ymm1, ymm1, ymm5
3309    vpackuswb  ymm0, ymm0, ymm1
3310    vpackuswb  ymm2, ymm2, ymm3
3311    vpermq     ymm0, ymm0, 0xd8
3312    vpermq     ymm2, ymm2, 0xd8
3313    vmovdqu    [edx], ymm0
3314    vmovdqu    [edx + edi], ymm2
3315    lea        edx, [edx + 32]
3316    sub        ecx, 32
3317    jg         convertloop
3318
3319    pop        edi
3320    vzeroupper
3321    ret
3322  }
3323}
3324#endif  // HAS_SPLITUVROW_AVX2
3325
3326#ifdef HAS_MERGEUVROW_SSE2
3327__declspec(naked)
3328void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3329                     int width) {
3330  __asm {
3331    push       edi
3332    mov        eax, [esp + 4 + 4]    // src_u
3333    mov        edx, [esp + 4 + 8]    // src_v
3334    mov        edi, [esp + 4 + 12]   // dst_uv
3335    mov        ecx, [esp + 4 + 16]   // width
3336    sub        edx, eax
3337
3338  convertloop:
3339    movdqu     xmm0, [eax]      // read 16 U's
3340    movdqu     xmm1, [eax + edx]  // and 16 V's
3341    lea        eax,  [eax + 16]
3342    movdqa     xmm2, xmm0
3343    punpcklbw  xmm0, xmm1       // first 8 UV pairs
3344    punpckhbw  xmm2, xmm1       // next 8 UV pairs
3345    movdqu     [edi], xmm0
3346    movdqu     [edi + 16], xmm2
3347    lea        edi, [edi + 32]
3348    sub        ecx, 16
3349    jg         convertloop
3350
3351    pop        edi
3352    ret
3353  }
3354}
3355#endif  //  HAS_MERGEUVROW_SSE2
3356
3357#ifdef HAS_MERGEUVROW_AVX2
3358__declspec(naked)
3359void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3360                     int width) {
3361  __asm {
3362    push       edi
3363    mov        eax, [esp + 4 + 4]    // src_u
3364    mov        edx, [esp + 4 + 8]    // src_v
3365    mov        edi, [esp + 4 + 12]   // dst_uv
3366    mov        ecx, [esp + 4 + 16]   // width
3367    sub        edx, eax
3368
3369  convertloop:
3370    vmovdqu    ymm0, [eax]           // read 32 U's
3371    vmovdqu    ymm1, [eax + edx]     // and 32 V's
3372    lea        eax,  [eax + 32]
3373    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
3374    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
3375    vextractf128 [edi], ymm2, 0       // bytes 0..15
3376    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
3377    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
3378    vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
3379    lea        edi, [edi + 64]
3380    sub        ecx, 32
3381    jg         convertloop
3382
3383    pop        edi
3384    vzeroupper
3385    ret
3386  }
3387}
3388#endif  //  HAS_MERGEUVROW_AVX2
3389
3390#ifdef HAS_COPYROW_SSE2
3391// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
3392__declspec(naked)
3393void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3394  __asm {
3395    mov        eax, [esp + 4]   // src
3396    mov        edx, [esp + 8]   // dst
3397    mov        ecx, [esp + 12]  // count
3398    test       eax, 15
3399    jne        convertloopu
3400    test       edx, 15
3401    jne        convertloopu
3402
3403  convertloopa:
3404    movdqa     xmm0, [eax]
3405    movdqa     xmm1, [eax + 16]
3406    lea        eax, [eax + 32]
3407    movdqa     [edx], xmm0
3408    movdqa     [edx + 16], xmm1
3409    lea        edx, [edx + 32]
3410    sub        ecx, 32
3411    jg         convertloopa
3412    ret
3413
3414  convertloopu:
3415    movdqu     xmm0, [eax]
3416    movdqu     xmm1, [eax + 16]
3417    lea        eax, [eax + 32]
3418    movdqu     [edx], xmm0
3419    movdqu     [edx + 16], xmm1
3420    lea        edx, [edx + 32]
3421    sub        ecx, 32
3422    jg         convertloopu
3423    ret
3424  }
3425}
3426#endif  // HAS_COPYROW_SSE2
3427
3428#ifdef HAS_COPYROW_AVX
3429// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
3430__declspec(naked)
3431void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3432  __asm {
3433    mov        eax, [esp + 4]   // src
3434    mov        edx, [esp + 8]   // dst
3435    mov        ecx, [esp + 12]  // count
3436
3437  convertloop:
3438    vmovdqu    ymm0, [eax]
3439    vmovdqu    ymm1, [eax + 32]
3440    lea        eax, [eax + 64]
3441    vmovdqu    [edx], ymm0
3442    vmovdqu    [edx + 32], ymm1
3443    lea        edx, [edx + 64]
3444    sub        ecx, 64
3445    jg         convertloop
3446
3447    vzeroupper
3448    ret
3449  }
3450}
3451#endif  // HAS_COPYROW_AVX
3452
3453// Multiple of 1.
3454__declspec(naked)
3455void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3456  __asm {
3457    mov        eax, esi
3458    mov        edx, edi
3459    mov        esi, [esp + 4]   // src
3460    mov        edi, [esp + 8]   // dst
3461    mov        ecx, [esp + 12]  // count
3462    rep movsb
3463    mov        edi, edx
3464    mov        esi, eax
3465    ret
3466  }
3467}
3468
3469#ifdef HAS_ARGBCOPYALPHAROW_SSE2
3470// width in pixels
3471__declspec(naked)
3472void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3473  __asm {
3474    mov        eax, [esp + 4]   // src
3475    mov        edx, [esp + 8]   // dst
3476    mov        ecx, [esp + 12]  // count
3477    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3478    pslld      xmm0, 24
3479    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3480    psrld      xmm1, 8
3481
3482  convertloop:
3483    movdqu     xmm2, [eax]
3484    movdqu     xmm3, [eax + 16]
3485    lea        eax, [eax + 32]
3486    movdqu     xmm4, [edx]
3487    movdqu     xmm5, [edx + 16]
3488    pand       xmm2, xmm0
3489    pand       xmm3, xmm0
3490    pand       xmm4, xmm1
3491    pand       xmm5, xmm1
3492    por        xmm2, xmm4
3493    por        xmm3, xmm5
3494    movdqu     [edx], xmm2
3495    movdqu     [edx + 16], xmm3
3496    lea        edx, [edx + 32]
3497    sub        ecx, 8
3498    jg         convertloop
3499
3500    ret
3501  }
3502}
3503#endif  // HAS_ARGBCOPYALPHAROW_SSE2
3504
3505#ifdef HAS_ARGBCOPYALPHAROW_AVX2
3506// width in pixels
3507__declspec(naked)
3508void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3509  __asm {
3510    mov        eax, [esp + 4]   // src
3511    mov        edx, [esp + 8]   // dst
3512    mov        ecx, [esp + 12]  // count
3513    vpcmpeqb   ymm0, ymm0, ymm0
3514    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3515
3516  convertloop:
3517    vmovdqu    ymm1, [eax]
3518    vmovdqu    ymm2, [eax + 32]
3519    lea        eax, [eax + 64]
3520    vpblendvb  ymm1, ymm1, [edx], ymm0
3521    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3522    vmovdqu    [edx], ymm1
3523    vmovdqu    [edx + 32], ymm2
3524    lea        edx, [edx + 64]
3525    sub        ecx, 16
3526    jg         convertloop
3527
3528    vzeroupper
3529    ret
3530  }
3531}
3532#endif  // HAS_ARGBCOPYALPHAROW_AVX2
3533
3534#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3535// width in pixels
3536__declspec(naked)
3537void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
3538  __asm {
3539    mov        eax, [esp + 4]   // src_argb
3540    mov        edx, [esp + 8]   // dst_a
3541    mov        ecx, [esp + 12]  // width
3542
3543  extractloop:
3544    movdqu     xmm0, [eax]
3545    movdqu     xmm1, [eax + 16]
3546    lea        eax, [eax + 32]
3547    psrld      xmm0, 24
3548    psrld      xmm1, 24
3549    packssdw   xmm0, xmm1
3550    packuswb   xmm0, xmm0
3551    movq       qword ptr [edx], xmm0
3552    lea        edx, [edx + 8]
3553    sub        ecx, 8
3554    jg         extractloop
3555
3556    ret
3557  }
3558}
3559#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
3560
3561#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3562// width in pixels
3563__declspec(naked)
3564void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3565  __asm {
3566    mov        eax, [esp + 4]   // src
3567    mov        edx, [esp + 8]   // dst
3568    mov        ecx, [esp + 12]  // count
3569    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3570    pslld      xmm0, 24
3571    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3572    psrld      xmm1, 8
3573
3574  convertloop:
3575    movq       xmm2, qword ptr [eax]  // 8 Y's
3576    lea        eax, [eax + 8]
3577    punpcklbw  xmm2, xmm2
3578    punpckhwd  xmm3, xmm2
3579    punpcklwd  xmm2, xmm2
3580    movdqu     xmm4, [edx]
3581    movdqu     xmm5, [edx + 16]
3582    pand       xmm2, xmm0
3583    pand       xmm3, xmm0
3584    pand       xmm4, xmm1
3585    pand       xmm5, xmm1
3586    por        xmm2, xmm4
3587    por        xmm3, xmm5
3588    movdqu     [edx], xmm2
3589    movdqu     [edx + 16], xmm3
3590    lea        edx, [edx + 32]
3591    sub        ecx, 8
3592    jg         convertloop
3593
3594    ret
3595  }
3596}
3597#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3598
3599#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3600// width in pixels
3601__declspec(naked)
3602void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3603  __asm {
3604    mov        eax, [esp + 4]   // src
3605    mov        edx, [esp + 8]   // dst
3606    mov        ecx, [esp + 12]  // count
3607    vpcmpeqb   ymm0, ymm0, ymm0
3608    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3609
3610  convertloop:
3611    vpmovzxbd  ymm1, qword ptr [eax]
3612    vpmovzxbd  ymm2, qword ptr [eax + 8]
3613    lea        eax, [eax + 16]
3614    vpslld     ymm1, ymm1, 24
3615    vpslld     ymm2, ymm2, 24
3616    vpblendvb  ymm1, ymm1, [edx], ymm0
3617    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3618    vmovdqu    [edx], ymm1
3619    vmovdqu    [edx + 32], ymm2
3620    lea        edx, [edx + 64]
3621    sub        ecx, 16
3622    jg         convertloop
3623
3624    vzeroupper
3625    ret
3626  }
3627}
3628#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3629
3630#ifdef HAS_SETROW_X86
3631// Write 'count' bytes using an 8 bit value repeated.
3632// Count should be multiple of 4.
3633__declspec(naked)
3634void SetRow_X86(uint8* dst, uint8 v8, int count) {
3635  __asm {
3636    movzx      eax, byte ptr [esp + 8]    // v8
3637    mov        edx, 0x01010101  // Duplicate byte to all bytes.
3638    mul        edx              // overwrites edx with upper part of result.
3639    mov        edx, edi
3640    mov        edi, [esp + 4]   // dst
3641    mov        ecx, [esp + 12]  // count
3642    shr        ecx, 2
3643    rep stosd
3644    mov        edi, edx
3645    ret
3646  }
3647}
3648
3649// Write 'count' bytes using an 8 bit value repeated.
3650__declspec(naked)
3651void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3652  __asm {
3653    mov        edx, edi
3654    mov        edi, [esp + 4]   // dst
3655    mov        eax, [esp + 8]   // v8
3656    mov        ecx, [esp + 12]  // count
3657    rep stosb
3658    mov        edi, edx
3659    ret
3660  }
3661}
3662
3663// Write 'count' 32 bit values.
3664__declspec(naked)
3665void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3666  __asm {
3667    mov        edx, edi
3668    mov        edi, [esp + 4]   // dst
3669    mov        eax, [esp + 8]   // v32
3670    mov        ecx, [esp + 12]  // count
3671    rep stosd
3672    mov        edi, edx
3673    ret
3674  }
3675}
3676#endif  // HAS_SETROW_X86
3677
3678#ifdef HAS_YUY2TOYROW_AVX2
3679__declspec(naked)
3680void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3681  __asm {
3682    mov        eax, [esp + 4]    // src_yuy2
3683    mov        edx, [esp + 8]    // dst_y
3684    mov        ecx, [esp + 12]   // width
3685    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3686    vpsrlw     ymm5, ymm5, 8
3687
3688  convertloop:
3689    vmovdqu    ymm0, [eax]
3690    vmovdqu    ymm1, [eax + 32]
3691    lea        eax,  [eax + 64]
3692    vpand      ymm0, ymm0, ymm5   // even bytes are Y
3693    vpand      ymm1, ymm1, ymm5
3694    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3695    vpermq     ymm0, ymm0, 0xd8
3696    vmovdqu    [edx], ymm0
3697    lea        edx, [edx + 32]
3698    sub        ecx, 32
3699    jg         convertloop
3700    vzeroupper
3701    ret
3702  }
3703}
3704
3705__declspec(naked)
3706void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3707                      uint8* dst_u, uint8* dst_v, int width) {
3708  __asm {
3709    push       esi
3710    push       edi
3711    mov        eax, [esp + 8 + 4]    // src_yuy2
3712    mov        esi, [esp + 8 + 8]    // stride_yuy2
3713    mov        edx, [esp + 8 + 12]   // dst_u
3714    mov        edi, [esp + 8 + 16]   // dst_v
3715    mov        ecx, [esp + 8 + 20]   // width
3716    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3717    vpsrlw     ymm5, ymm5, 8
3718    sub        edi, edx
3719
3720  convertloop:
3721    vmovdqu    ymm0, [eax]
3722    vmovdqu    ymm1, [eax + 32]
3723    vpavgb     ymm0, ymm0, [eax + esi]
3724    vpavgb     ymm1, ymm1, [eax + esi + 32]
3725    lea        eax,  [eax + 64]
3726    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3727    vpsrlw     ymm1, ymm1, 8
3728    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3729    vpermq     ymm0, ymm0, 0xd8
3730    vpand      ymm1, ymm0, ymm5  // U
3731    vpsrlw     ymm0, ymm0, 8     // V
3732    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3733    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3734    vpermq     ymm1, ymm1, 0xd8
3735    vpermq     ymm0, ymm0, 0xd8
3736    vextractf128 [edx], ymm1, 0  // U
3737    vextractf128 [edx + edi], ymm0, 0 // V
3738    lea        edx, [edx + 16]
3739    sub        ecx, 32
3740    jg         convertloop
3741
3742    pop        edi
3743    pop        esi
3744    vzeroupper
3745    ret
3746  }
3747}
3748
3749__declspec(naked)
3750void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3751                         uint8* dst_u, uint8* dst_v, int width) {
3752  __asm {
3753    push       edi
3754    mov        eax, [esp + 4 + 4]    // src_yuy2
3755    mov        edx, [esp + 4 + 8]    // dst_u
3756    mov        edi, [esp + 4 + 12]   // dst_v
3757    mov        ecx, [esp + 4 + 16]   // width
3758    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3759    vpsrlw     ymm5, ymm5, 8
3760    sub        edi, edx
3761
3762  convertloop:
3763    vmovdqu    ymm0, [eax]
3764    vmovdqu    ymm1, [eax + 32]
3765    lea        eax,  [eax + 64]
3766    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3767    vpsrlw     ymm1, ymm1, 8
3768    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3769    vpermq     ymm0, ymm0, 0xd8
3770    vpand      ymm1, ymm0, ymm5  // U
3771    vpsrlw     ymm0, ymm0, 8     // V
3772    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3773    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3774    vpermq     ymm1, ymm1, 0xd8
3775    vpermq     ymm0, ymm0, 0xd8
3776    vextractf128 [edx], ymm1, 0  // U
3777    vextractf128 [edx + edi], ymm0, 0 // V
3778    lea        edx, [edx + 16]
3779    sub        ecx, 32
3780    jg         convertloop
3781
3782    pop        edi
3783    vzeroupper
3784    ret
3785  }
3786}
3787
3788__declspec(naked)
3789void UYVYToYRow_AVX2(const uint8* src_uyvy,
3790                     uint8* dst_y, int width) {
3791  __asm {
3792    mov        eax, [esp + 4]    // src_uyvy
3793    mov        edx, [esp + 8]    // dst_y
3794    mov        ecx, [esp + 12]   // width
3795
3796  convertloop:
3797    vmovdqu    ymm0, [eax]
3798    vmovdqu    ymm1, [eax + 32]
3799    lea        eax,  [eax + 64]
3800    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
3801    vpsrlw     ymm1, ymm1, 8
3802    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3803    vpermq     ymm0, ymm0, 0xd8
3804    vmovdqu    [edx], ymm0
3805    lea        edx, [edx + 32]
3806    sub        ecx, 32
3807    jg         convertloop
3808    vzeroupper
3809    ret
3810  }
3811}
3812
3813__declspec(naked)
3814void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3815                      uint8* dst_u, uint8* dst_v, int width) {
3816  __asm {
3817    push       esi
3818    push       edi
3819    mov        eax, [esp + 8 + 4]    // src_yuy2
3820    mov        esi, [esp + 8 + 8]    // stride_yuy2
3821    mov        edx, [esp + 8 + 12]   // dst_u
3822    mov        edi, [esp + 8 + 16]   // dst_v
3823    mov        ecx, [esp + 8 + 20]   // width
3824    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3825    vpsrlw     ymm5, ymm5, 8
3826    sub        edi, edx
3827
3828  convertloop:
3829    vmovdqu    ymm0, [eax]
3830    vmovdqu    ymm1, [eax + 32]
3831    vpavgb     ymm0, ymm0, [eax + esi]
3832    vpavgb     ymm1, ymm1, [eax + esi + 32]
3833    lea        eax,  [eax + 64]
3834    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
3835    vpand      ymm1, ymm1, ymm5
3836    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3837    vpermq     ymm0, ymm0, 0xd8
3838    vpand      ymm1, ymm0, ymm5  // U
3839    vpsrlw     ymm0, ymm0, 8     // V
3840    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3841    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3842    vpermq     ymm1, ymm1, 0xd8
3843    vpermq     ymm0, ymm0, 0xd8
3844    vextractf128 [edx], ymm1, 0  // U
3845    vextractf128 [edx + edi], ymm0, 0 // V
3846    lea        edx, [edx + 16]
3847    sub        ecx, 32
3848    jg         convertloop
3849
3850    pop        edi
3851    pop        esi
3852    vzeroupper
3853    ret
3854  }
3855}
3856
3857__declspec(naked)
3858void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3859                         uint8* dst_u, uint8* dst_v, int width) {
3860  __asm {
3861    push       edi
3862    mov        eax, [esp + 4 + 4]    // src_yuy2
3863    mov        edx, [esp + 4 + 8]    // dst_u
3864    mov        edi, [esp + 4 + 12]   // dst_v
3865    mov        ecx, [esp + 4 + 16]   // width
3866    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3867    vpsrlw     ymm5, ymm5, 8
3868    sub        edi, edx
3869
3870  convertloop:
3871    vmovdqu    ymm0, [eax]
3872    vmovdqu    ymm1, [eax + 32]
3873    lea        eax,  [eax + 64]
3874    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
3875    vpand      ymm1, ymm1, ymm5
3876    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3877    vpermq     ymm0, ymm0, 0xd8
3878    vpand      ymm1, ymm0, ymm5  // U
3879    vpsrlw     ymm0, ymm0, 8     // V
3880    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3881    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3882    vpermq     ymm1, ymm1, 0xd8
3883    vpermq     ymm0, ymm0, 0xd8
3884    vextractf128 [edx], ymm1, 0  // U
3885    vextractf128 [edx + edi], ymm0, 0 // V
3886    lea        edx, [edx + 16]
3887    sub        ecx, 32
3888    jg         convertloop
3889
3890    pop        edi
3891    vzeroupper
3892    ret
3893  }
3894}
3895#endif  // HAS_YUY2TOYROW_AVX2
3896
3897#ifdef HAS_YUY2TOYROW_SSE2
3898__declspec(naked)
3899void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3900                     uint8* dst_y, int width) {
3901  __asm {
3902    mov        eax, [esp + 4]    // src_yuy2
3903    mov        edx, [esp + 8]    // dst_y
3904    mov        ecx, [esp + 12]   // width
3905    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
3906    psrlw      xmm5, 8
3907
3908  convertloop:
3909    movdqu     xmm0, [eax]
3910    movdqu     xmm1, [eax + 16]
3911    lea        eax,  [eax + 32]
3912    pand       xmm0, xmm5   // even bytes are Y
3913    pand       xmm1, xmm5
3914    packuswb   xmm0, xmm1
3915    movdqu     [edx], xmm0
3916    lea        edx, [edx + 16]
3917    sub        ecx, 16
3918    jg         convertloop
3919    ret
3920  }
3921}
3922
3923__declspec(naked)
3924void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3925                      uint8* dst_u, uint8* dst_v, int width) {
3926  __asm {
3927    push       esi
3928    push       edi
3929    mov        eax, [esp + 8 + 4]    // src_yuy2
3930    mov        esi, [esp + 8 + 8]    // stride_yuy2
3931    mov        edx, [esp + 8 + 12]   // dst_u
3932    mov        edi, [esp + 8 + 16]   // dst_v
3933    mov        ecx, [esp + 8 + 20]   // width
3934    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3935    psrlw      xmm5, 8
3936    sub        edi, edx
3937
3938  convertloop:
3939    movdqu     xmm0, [eax]
3940    movdqu     xmm1, [eax + 16]
3941    movdqu     xmm2, [eax + esi]
3942    movdqu     xmm3, [eax + esi + 16]
3943    lea        eax,  [eax + 32]
3944    pavgb      xmm0, xmm2
3945    pavgb      xmm1, xmm3
3946    psrlw      xmm0, 8      // YUYV -> UVUV
3947    psrlw      xmm1, 8
3948    packuswb   xmm0, xmm1
3949    movdqa     xmm1, xmm0
3950    pand       xmm0, xmm5  // U
3951    packuswb   xmm0, xmm0
3952    psrlw      xmm1, 8     // V
3953    packuswb   xmm1, xmm1
3954    movq       qword ptr [edx], xmm0
3955    movq       qword ptr [edx + edi], xmm1
3956    lea        edx, [edx + 8]
3957    sub        ecx, 16
3958    jg         convertloop
3959
3960    pop        edi
3961    pop        esi
3962    ret
3963  }
3964}
3965
3966__declspec(naked)
3967void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3968                         uint8* dst_u, uint8* dst_v, int width) {
3969  __asm {
3970    push       edi
3971    mov        eax, [esp + 4 + 4]    // src_yuy2
3972    mov        edx, [esp + 4 + 8]    // dst_u
3973    mov        edi, [esp + 4 + 12]   // dst_v
3974    mov        ecx, [esp + 4 + 16]   // width
3975    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3976    psrlw      xmm5, 8
3977    sub        edi, edx
3978
3979  convertloop:
3980    movdqu     xmm0, [eax]
3981    movdqu     xmm1, [eax + 16]
3982    lea        eax,  [eax + 32]
3983    psrlw      xmm0, 8      // YUYV -> UVUV
3984    psrlw      xmm1, 8
3985    packuswb   xmm0, xmm1
3986    movdqa     xmm1, xmm0
3987    pand       xmm0, xmm5  // U
3988    packuswb   xmm0, xmm0
3989    psrlw      xmm1, 8     // V
3990    packuswb   xmm1, xmm1
3991    movq       qword ptr [edx], xmm0
3992    movq       qword ptr [edx + edi], xmm1
3993    lea        edx, [edx + 8]
3994    sub        ecx, 16
3995    jg         convertloop
3996
3997    pop        edi
3998    ret
3999  }
4000}
4001
4002__declspec(naked)
4003void UYVYToYRow_SSE2(const uint8* src_uyvy,
4004                     uint8* dst_y, int width) {
4005  __asm {
4006    mov        eax, [esp + 4]    // src_uyvy
4007    mov        edx, [esp + 8]    // dst_y
4008    mov        ecx, [esp + 12]   // width
4009
4010  convertloop:
4011    movdqu     xmm0, [eax]
4012    movdqu     xmm1, [eax + 16]
4013    lea        eax,  [eax + 32]
4014    psrlw      xmm0, 8    // odd bytes are Y
4015    psrlw      xmm1, 8
4016    packuswb   xmm0, xmm1
4017    movdqu     [edx], xmm0
4018    lea        edx, [edx + 16]
4019    sub        ecx, 16
4020    jg         convertloop
4021    ret
4022  }
4023}
4024
4025__declspec(naked)
4026void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
4027                      uint8* dst_u, uint8* dst_v, int width) {
4028  __asm {
4029    push       esi
4030    push       edi
4031    mov        eax, [esp + 8 + 4]    // src_yuy2
4032    mov        esi, [esp + 8 + 8]    // stride_yuy2
4033    mov        edx, [esp + 8 + 12]   // dst_u
4034    mov        edi, [esp + 8 + 16]   // dst_v
4035    mov        ecx, [esp + 8 + 20]   // width
4036    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4037    psrlw      xmm5, 8
4038    sub        edi, edx
4039
4040  convertloop:
4041    movdqu     xmm0, [eax]
4042    movdqu     xmm1, [eax + 16]
4043    movdqu     xmm2, [eax + esi]
4044    movdqu     xmm3, [eax + esi + 16]
4045    lea        eax,  [eax + 32]
4046    pavgb      xmm0, xmm2
4047    pavgb      xmm1, xmm3
4048    pand       xmm0, xmm5   // UYVY -> UVUV
4049    pand       xmm1, xmm5
4050    packuswb   xmm0, xmm1
4051    movdqa     xmm1, xmm0
4052    pand       xmm0, xmm5  // U
4053    packuswb   xmm0, xmm0
4054    psrlw      xmm1, 8     // V
4055    packuswb   xmm1, xmm1
4056    movq       qword ptr [edx], xmm0
4057    movq       qword ptr [edx + edi], xmm1
4058    lea        edx, [edx + 8]
4059    sub        ecx, 16
4060    jg         convertloop
4061
4062    pop        edi
4063    pop        esi
4064    ret
4065  }
4066}
4067
4068__declspec(naked)
4069void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4070                         uint8* dst_u, uint8* dst_v, int width) {
4071  __asm {
4072    push       edi
4073    mov        eax, [esp + 4 + 4]    // src_yuy2
4074    mov        edx, [esp + 4 + 8]    // dst_u
4075    mov        edi, [esp + 4 + 12]   // dst_v
4076    mov        ecx, [esp + 4 + 16]   // width
4077    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4078    psrlw      xmm5, 8
4079    sub        edi, edx
4080
4081  convertloop:
4082    movdqu     xmm0, [eax]
4083    movdqu     xmm1, [eax + 16]
4084    lea        eax,  [eax + 32]
4085    pand       xmm0, xmm5   // UYVY -> UVUV
4086    pand       xmm1, xmm5
4087    packuswb   xmm0, xmm1
4088    movdqa     xmm1, xmm0
4089    pand       xmm0, xmm5  // U
4090    packuswb   xmm0, xmm0
4091    psrlw      xmm1, 8     // V
4092    packuswb   xmm1, xmm1
4093    movq       qword ptr [edx], xmm0
4094    movq       qword ptr [edx + edi], xmm1
4095    lea        edx, [edx + 8]
4096    sub        ecx, 16
4097    jg         convertloop
4098
4099    pop        edi
4100    ret
4101  }
4102}
4103#endif  // HAS_YUY2TOYROW_SSE2
4104
4105#ifdef HAS_BLENDPLANEROW_SSSE3
4106// Blend 8 pixels at a time.
4107// unsigned version of math
4108// =((A2*C2)+(B2*(255-C2))+255)/256
4109// signed version of math
4110// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4111__declspec(naked)
4112void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
4113                         const uint8* alpha, uint8* dst, int width) {
4114  __asm {
4115    push       esi
4116    push       edi
4117    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4118    psllw      xmm5, 8
4119    mov        eax, 0x80808080  // 128 for biasing image to signed.
4120    movd       xmm6, eax
4121    pshufd     xmm6, xmm6, 0x00
4122
4123    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
4124    movd       xmm7, eax
4125    pshufd     xmm7, xmm7, 0x00
4126    mov        eax, [esp + 8 + 4]   // src0
4127    mov        edx, [esp + 8 + 8]   // src1
4128    mov        esi, [esp + 8 + 12]  // alpha
4129    mov        edi, [esp + 8 + 16]  // dst
4130    mov        ecx, [esp + 8 + 20]  // width
4131    sub        eax, esi
4132    sub        edx, esi
4133    sub        edi, esi
4134
4135    // 8 pixel loop.
4136  convertloop8:
4137    movq       xmm0, qword ptr [esi]        // alpha
4138    punpcklbw  xmm0, xmm0
4139    pxor       xmm0, xmm5         // a, 255-a
4140    movq       xmm1, qword ptr [eax + esi]  // src0
4141    movq       xmm2, qword ptr [edx + esi]  // src1
4142    punpcklbw  xmm1, xmm2
4143    psubb      xmm1, xmm6         // bias src0/1 - 128
4144    pmaddubsw  xmm0, xmm1
4145    paddw      xmm0, xmm7         // unbias result - 32768 and round.
4146    psrlw      xmm0, 8
4147    packuswb   xmm0, xmm0
4148    movq       qword ptr [edi + esi], xmm0
4149    lea        esi, [esi + 8]
4150    sub        ecx, 8
4151    jg         convertloop8
4152
4153    pop        edi
4154    pop        esi
4155    ret
4156  }
4157}
4158#endif  // HAS_BLENDPLANEROW_SSSE3
4159
4160#ifdef HAS_BLENDPLANEROW_AVX2
4161// Blend 32 pixels at a time.
4162// unsigned version of math
4163// =((A2*C2)+(B2*(255-C2))+255)/256
4164// signed version of math
4165// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4166__declspec(naked)
4167void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
4168                         const uint8* alpha, uint8* dst, int width) {
4169  __asm {
4170    push        esi
4171    push        edi
4172    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
4173    vpsllw      ymm5, ymm5, 8
4174    mov         eax, 0x80808080  // 128 for biasing image to signed.
4175    vmovd       xmm6, eax
4176    vbroadcastss ymm6, xmm6
4177    mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
4178    vmovd       xmm7, eax
4179    vbroadcastss ymm7, xmm7
4180    mov         eax, [esp + 8 + 4]   // src0
4181    mov         edx, [esp + 8 + 8]   // src1
4182    mov         esi, [esp + 8 + 12]  // alpha
4183    mov         edi, [esp + 8 + 16]  // dst
4184    mov         ecx, [esp + 8 + 20]  // width
4185    sub         eax, esi
4186    sub         edx, esi
4187    sub         edi, esi
4188
4189    // 32 pixel loop.
4190  convertloop32:
4191    vmovdqu     ymm0, [esi]        // alpha
4192    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
4193    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
4194    vpxor       ymm3, ymm3, ymm5   // a, 255-a
4195    vpxor       ymm0, ymm0, ymm5   // a, 255-a
4196    vmovdqu     ymm1, [eax + esi]  // src0
4197    vmovdqu     ymm2, [edx + esi]  // src1
4198    vpunpckhbw  ymm4, ymm1, ymm2
4199    vpunpcklbw  ymm1, ymm1, ymm2
4200    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
4201    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
4202    vpmaddubsw  ymm3, ymm3, ymm4
4203    vpmaddubsw  ymm0, ymm0, ymm1
4204    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
4205    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
4206    vpsrlw      ymm3, ymm3, 8
4207    vpsrlw      ymm0, ymm0, 8
4208    vpackuswb   ymm0, ymm0, ymm3
4209    vmovdqu     [edi + esi], ymm0
4210    lea         esi, [esi + 32]
4211    sub         ecx, 32
4212    jg          convertloop32
4213
4214    pop         edi
4215    pop         esi
4216    vzeroupper
4217    ret
4218  }
4219}
4220#endif  // HAS_BLENDPLANEROW_AVX2
4221
4222#ifdef HAS_ARGBBLENDROW_SSSE3
4223// Shuffle table for isolating alpha.
4224static const uvec8 kShuffleAlpha = {
4225  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4226  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4227};
4228
4229// Blend 8 pixels at a time.
4230__declspec(naked)
4231void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4232                        uint8* dst_argb, int width) {
4233  __asm {
4234    push       esi
4235    mov        eax, [esp + 4 + 4]   // src_argb0
4236    mov        esi, [esp + 4 + 8]   // src_argb1
4237    mov        edx, [esp + 4 + 12]  // dst_argb
4238    mov        ecx, [esp + 4 + 16]  // width
4239    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
4240    psrlw      xmm7, 15
4241    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4242    psrlw      xmm6, 8
4243    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4244    psllw      xmm5, 8
4245    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4246    pslld      xmm4, 24
4247    sub        ecx, 4
4248    jl         convertloop4b    // less than 4 pixels?
4249
4250    // 4 pixel loop.
4251  convertloop4:
4252    movdqu     xmm3, [eax]      // src argb
4253    lea        eax, [eax + 16]
4254    movdqa     xmm0, xmm3       // src argb
4255    pxor       xmm3, xmm4       // ~alpha
4256    movdqu     xmm2, [esi]      // _r_b
4257    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
4258    pand       xmm2, xmm6       // _r_b
4259    paddw      xmm3, xmm7       // 256 - alpha
4260    pmullw     xmm2, xmm3       // _r_b * alpha
4261    movdqu     xmm1, [esi]      // _a_g
4262    lea        esi, [esi + 16]
4263    psrlw      xmm1, 8          // _a_g
4264    por        xmm0, xmm4       // set alpha to 255
4265    pmullw     xmm1, xmm3       // _a_g * alpha
4266    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4267    paddusb    xmm0, xmm2       // + src argb
4268    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4269    paddusb    xmm0, xmm1       // + src argb
4270    movdqu     [edx], xmm0
4271    lea        edx, [edx + 16]
4272    sub        ecx, 4
4273    jge        convertloop4
4274
4275  convertloop4b:
4276    add        ecx, 4 - 1
4277    jl         convertloop1b
4278
4279    // 1 pixel loop.
4280  convertloop1:
4281    movd       xmm3, [eax]      // src argb
4282    lea        eax, [eax + 4]
4283    movdqa     xmm0, xmm3       // src argb
4284    pxor       xmm3, xmm4       // ~alpha
4285    movd       xmm2, [esi]      // _r_b
4286    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
4287    pand       xmm2, xmm6       // _r_b
4288    paddw      xmm3, xmm7       // 256 - alpha
4289    pmullw     xmm2, xmm3       // _r_b * alpha
4290    movd       xmm1, [esi]      // _a_g
4291    lea        esi, [esi + 4]
4292    psrlw      xmm1, 8          // _a_g
4293    por        xmm0, xmm4       // set alpha to 255
4294    pmullw     xmm1, xmm3       // _a_g * alpha
4295    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4296    paddusb    xmm0, xmm2       // + src argb
4297    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4298    paddusb    xmm0, xmm1       // + src argb
4299    movd       [edx], xmm0
4300    lea        edx, [edx + 4]
4301    sub        ecx, 1
4302    jge        convertloop1
4303
4304  convertloop1b:
4305    pop        esi
4306    ret
4307  }
4308}
4309#endif  // HAS_ARGBBLENDROW_SSSE3
4310
4311#ifdef HAS_ARGBATTENUATEROW_SSSE3
4312// Shuffle table duplicating alpha.
4313static const uvec8 kShuffleAlpha0 = {
4314  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4315};
4316static const uvec8 kShuffleAlpha1 = {
4317  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4318  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4319};
4320__declspec(naked)
4321void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4322  __asm {
4323    mov        eax, [esp + 4]   // src_argb0
4324    mov        edx, [esp + 8]   // dst_argb
4325    mov        ecx, [esp + 12]  // width
4326    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
4327    pslld      xmm3, 24
4328    movdqa     xmm4, xmmword ptr kShuffleAlpha0
4329    movdqa     xmm5, xmmword ptr kShuffleAlpha1
4330
4331 convertloop:
4332    movdqu     xmm0, [eax]      // read 4 pixels
4333    pshufb     xmm0, xmm4       // isolate first 2 alphas
4334    movdqu     xmm1, [eax]      // read 4 pixels
4335    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
4336    pmulhuw    xmm0, xmm1       // rgb * a
4337    movdqu     xmm1, [eax]      // read 4 pixels
4338    pshufb     xmm1, xmm5       // isolate next 2 alphas
4339    movdqu     xmm2, [eax]      // read 4 pixels
4340    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
4341    pmulhuw    xmm1, xmm2       // rgb * a
4342    movdqu     xmm2, [eax]      // mask original alpha
4343    lea        eax, [eax + 16]
4344    pand       xmm2, xmm3
4345    psrlw      xmm0, 8
4346    psrlw      xmm1, 8
4347    packuswb   xmm0, xmm1
4348    por        xmm0, xmm2       // copy original alpha
4349    movdqu     [edx], xmm0
4350    lea        edx, [edx + 16]
4351    sub        ecx, 4
4352    jg         convertloop
4353
4354    ret
4355  }
4356}
4357#endif  // HAS_ARGBATTENUATEROW_SSSE3
4358
4359#ifdef HAS_ARGBATTENUATEROW_AVX2
4360// Shuffle table duplicating alpha.
4361static const uvec8 kShuffleAlpha_AVX2 = {
4362  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
4363};
4364__declspec(naked)
4365void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4366  __asm {
4367    mov        eax, [esp + 4]   // src_argb0
4368    mov        edx, [esp + 8]   // dst_argb
4369    mov        ecx, [esp + 12]  // width
4370    sub        edx, eax
4371    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4372    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
4373    vpslld     ymm5, ymm5, 24
4374
4375 convertloop:
4376    vmovdqu    ymm6, [eax]       // read 8 pixels.
4377    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4378    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4379    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
4380    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
4381    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
4382    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
4383    vpand      ymm6, ymm6, ymm5  // isolate alpha
4384    vpsrlw     ymm0, ymm0, 8
4385    vpsrlw     ymm1, ymm1, 8
4386    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4387    vpor       ymm0, ymm0, ymm6  // copy original alpha
4388    vmovdqu    [eax + edx], ymm0
4389    lea        eax, [eax + 32]
4390    sub        ecx, 8
4391    jg         convertloop
4392
4393    vzeroupper
4394    ret
4395  }
4396}
4397#endif  // HAS_ARGBATTENUATEROW_AVX2
4398
4399#ifdef HAS_ARGBUNATTENUATEROW_SSE2
4400// Unattenuate 4 pixels at a time.
4401__declspec(naked)
4402void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4403                             int width) {
4404  __asm {
4405    push       ebx
4406    push       esi
4407    push       edi
4408    mov        eax, [esp + 12 + 4]   // src_argb
4409    mov        edx, [esp + 12 + 8]   // dst_argb
4410    mov        ecx, [esp + 12 + 12]  // width
4411    lea        ebx, fixed_invtbl8
4412
4413 convertloop:
4414    movdqu     xmm0, [eax]      // read 4 pixels
4415    movzx      esi, byte ptr [eax + 3]  // first alpha
4416    movzx      edi, byte ptr [eax + 7]  // second alpha
4417    punpcklbw  xmm0, xmm0       // first 2
4418    movd       xmm2, dword ptr [ebx + esi * 4]
4419    movd       xmm3, dword ptr [ebx + edi * 4]
4420    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
4421    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4422    movlhps    xmm2, xmm3
4423    pmulhuw    xmm0, xmm2       // rgb * a
4424
4425    movdqu     xmm1, [eax]      // read 4 pixels
4426    movzx      esi, byte ptr [eax + 11]  // third alpha
4427    movzx      edi, byte ptr [eax + 15]  // forth alpha
4428    punpckhbw  xmm1, xmm1       // next 2
4429    movd       xmm2, dword ptr [ebx + esi * 4]
4430    movd       xmm3, dword ptr [ebx + edi * 4]
4431    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
4432    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4433    movlhps    xmm2, xmm3
4434    pmulhuw    xmm1, xmm2       // rgb * a
4435    lea        eax, [eax + 16]
4436    packuswb   xmm0, xmm1
4437    movdqu     [edx], xmm0
4438    lea        edx, [edx + 16]
4439    sub        ecx, 4
4440    jg         convertloop
4441
4442    pop        edi
4443    pop        esi
4444    pop        ebx
4445    ret
4446  }
4447}
4448#endif  // HAS_ARGBUNATTENUATEROW_SSE2
4449
4450#ifdef HAS_ARGBUNATTENUATEROW_AVX2
4451// Shuffle table duplicating alpha.
4452static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4453  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
4454};
4455// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4456// USE_GATHER is not on by default, due to being a slow instruction.
4457#ifdef USE_GATHER
4458__declspec(naked)
4459void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4460                             int width) {
4461  __asm {
4462    mov        eax, [esp + 4]   // src_argb0
4463    mov        edx, [esp + 8]   // dst_argb
4464    mov        ecx, [esp + 12]  // width
4465    sub        edx, eax
4466    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4467
4468 convertloop:
4469    vmovdqu    ymm6, [eax]       // read 8 pixels.
4470    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
4471    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
4472    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4473    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4474    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
4475    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4476    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4477    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
4478    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
4479    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4480    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4481    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4482    vmovdqu    [eax + edx], ymm0
4483    lea        eax, [eax + 32]
4484    sub        ecx, 8
4485    jg         convertloop
4486
4487    vzeroupper
4488    ret
4489  }
4490}
4491#else  // USE_GATHER
4492__declspec(naked)
4493void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4494                             int width) {
4495  __asm {
4496
4497    push       ebx
4498    push       esi
4499    push       edi
4500    mov        eax, [esp + 12 + 4]   // src_argb
4501    mov        edx, [esp + 12 + 8]   // dst_argb
4502    mov        ecx, [esp + 12 + 12]  // width
4503    sub        edx, eax
4504    lea        ebx, fixed_invtbl8
4505    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4506
4507 convertloop:
4508    // replace VPGATHER
4509    movzx      esi, byte ptr [eax + 3]                 // alpha0
4510    movzx      edi, byte ptr [eax + 7]                 // alpha1
4511    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
4512    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
4513    movzx      esi, byte ptr [eax + 11]                // alpha2
4514    movzx      edi, byte ptr [eax + 15]                // alpha3
4515    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
4516    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
4517    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
4518    movzx      esi, byte ptr [eax + 19]                // alpha4
4519    movzx      edi, byte ptr [eax + 23]                // alpha5
4520    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
4521    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
4522    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
4523    movzx      esi, byte ptr [eax + 27]                // alpha6
4524    movzx      edi, byte ptr [eax + 31]                // alpha7
4525    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
4526    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
4527    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
4528    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
4529    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
4530    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
4531    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4532    // end of VPGATHER
4533
4534    vmovdqu    ymm6, [eax]       // read 8 pixels.
4535    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4536    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4537    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4538    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4539    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
4540    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
4541    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4542    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4543    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4544    vmovdqu    [eax + edx], ymm0
4545    lea        eax, [eax + 32]
4546    sub        ecx, 8
4547    jg         convertloop
4548
4549    pop        edi
4550    pop        esi
4551    pop        ebx
4552    vzeroupper
4553    ret
4554  }
4555}
4556#endif  // USE_GATHER
4557#endif  // HAS_ARGBATTENUATEROW_AVX2
4558
4559#ifdef HAS_ARGBGRAYROW_SSSE3
4560// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4561__declspec(naked)
4562void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4563  __asm {
4564    mov        eax, [esp + 4]   /* src_argb */
4565    mov        edx, [esp + 8]   /* dst_argb */
4566    mov        ecx, [esp + 12]  /* width */
4567    movdqa     xmm4, xmmword ptr kARGBToYJ
4568    movdqa     xmm5, xmmword ptr kAddYJ64
4569
4570 convertloop:
4571    movdqu     xmm0, [eax]  // G
4572    movdqu     xmm1, [eax + 16]
4573    pmaddubsw  xmm0, xmm4
4574    pmaddubsw  xmm1, xmm4
4575    phaddw     xmm0, xmm1
4576    paddw      xmm0, xmm5  // Add .5 for rounding.
4577    psrlw      xmm0, 7
4578    packuswb   xmm0, xmm0   // 8 G bytes
4579    movdqu     xmm2, [eax]  // A
4580    movdqu     xmm3, [eax + 16]
4581    lea        eax, [eax + 32]
4582    psrld      xmm2, 24
4583    psrld      xmm3, 24
4584    packuswb   xmm2, xmm3
4585    packuswb   xmm2, xmm2   // 8 A bytes
4586    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
4587    punpcklbw  xmm0, xmm0   // 8 GG words
4588    punpcklbw  xmm3, xmm2   // 8 GA words
4589    movdqa     xmm1, xmm0
4590    punpcklwd  xmm0, xmm3   // GGGA first 4
4591    punpckhwd  xmm1, xmm3   // GGGA next 4
4592    movdqu     [edx], xmm0
4593    movdqu     [edx + 16], xmm1
4594    lea        edx, [edx + 32]
4595    sub        ecx, 8
4596    jg         convertloop
4597    ret
4598  }
4599}
4600#endif  // HAS_ARGBGRAYROW_SSSE3
4601
4602#ifdef HAS_ARGBSEPIAROW_SSSE3
4603//    b = (r * 35 + g * 68 + b * 17) >> 7
4604//    g = (r * 45 + g * 88 + b * 22) >> 7
4605//    r = (r * 50 + g * 98 + b * 24) >> 7
4606// Constant for ARGB color to sepia tone.
4607static const vec8 kARGBToSepiaB = {
4608  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
4609};
4610
4611static const vec8 kARGBToSepiaG = {
4612  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4613};
4614
4615static const vec8 kARGBToSepiaR = {
4616  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4617};
4618
4619// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4620__declspec(naked)
4621void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4622  __asm {
4623    mov        eax, [esp + 4]   /* dst_argb */
4624    mov        ecx, [esp + 8]   /* width */
4625    movdqa     xmm2, xmmword ptr kARGBToSepiaB
4626    movdqa     xmm3, xmmword ptr kARGBToSepiaG
4627    movdqa     xmm4, xmmword ptr kARGBToSepiaR
4628
4629 convertloop:
4630    movdqu     xmm0, [eax]  // B
4631    movdqu     xmm6, [eax + 16]
4632    pmaddubsw  xmm0, xmm2
4633    pmaddubsw  xmm6, xmm2
4634    phaddw     xmm0, xmm6
4635    psrlw      xmm0, 7
4636    packuswb   xmm0, xmm0   // 8 B values
4637    movdqu     xmm5, [eax]  // G
4638    movdqu     xmm1, [eax + 16]
4639    pmaddubsw  xmm5, xmm3
4640    pmaddubsw  xmm1, xmm3
4641    phaddw     xmm5, xmm1
4642    psrlw      xmm5, 7
4643    packuswb   xmm5, xmm5   // 8 G values
4644    punpcklbw  xmm0, xmm5   // 8 BG values
4645    movdqu     xmm5, [eax]  // R
4646    movdqu     xmm1, [eax + 16]
4647    pmaddubsw  xmm5, xmm4
4648    pmaddubsw  xmm1, xmm4
4649    phaddw     xmm5, xmm1
4650    psrlw      xmm5, 7
4651    packuswb   xmm5, xmm5   // 8 R values
4652    movdqu     xmm6, [eax]  // A
4653    movdqu     xmm1, [eax + 16]
4654    psrld      xmm6, 24
4655    psrld      xmm1, 24
4656    packuswb   xmm6, xmm1
4657    packuswb   xmm6, xmm6   // 8 A values
4658    punpcklbw  xmm5, xmm6   // 8 RA values
4659    movdqa     xmm1, xmm0   // Weave BG, RA together
4660    punpcklwd  xmm0, xmm5   // BGRA first 4
4661    punpckhwd  xmm1, xmm5   // BGRA next 4
4662    movdqu     [eax], xmm0
4663    movdqu     [eax + 16], xmm1
4664    lea        eax, [eax + 32]
4665    sub        ecx, 8
4666    jg         convertloop
4667    ret
4668  }
4669}
4670#endif  // HAS_ARGBSEPIAROW_SSSE3
4671
4672#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4673// Tranform 8 ARGB pixels (32 bytes) with color matrix.
4674// Same as Sepia except matrix is provided.
4675// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4676// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
4677__declspec(naked)
4678void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4679                              const int8* matrix_argb, int width) {
4680  __asm {
4681    mov        eax, [esp + 4]   /* src_argb */
4682    mov        edx, [esp + 8]   /* dst_argb */
4683    mov        ecx, [esp + 12]  /* matrix_argb */
4684    movdqu     xmm5, [ecx]
4685    pshufd     xmm2, xmm5, 0x00
4686    pshufd     xmm3, xmm5, 0x55
4687    pshufd     xmm4, xmm5, 0xaa
4688    pshufd     xmm5, xmm5, 0xff
4689    mov        ecx, [esp + 16]  /* width */
4690
4691 convertloop:
4692    movdqu     xmm0, [eax]  // B
4693    movdqu     xmm7, [eax + 16]
4694    pmaddubsw  xmm0, xmm2
4695    pmaddubsw  xmm7, xmm2
4696    movdqu     xmm6, [eax]  // G
4697    movdqu     xmm1, [eax + 16]
4698    pmaddubsw  xmm6, xmm3
4699    pmaddubsw  xmm1, xmm3
4700    phaddsw    xmm0, xmm7   // B
4701    phaddsw    xmm6, xmm1   // G
4702    psraw      xmm0, 6      // B
4703    psraw      xmm6, 6      // G
4704    packuswb   xmm0, xmm0   // 8 B values
4705    packuswb   xmm6, xmm6   // 8 G values
4706    punpcklbw  xmm0, xmm6   // 8 BG values
4707    movdqu     xmm1, [eax]  // R
4708    movdqu     xmm7, [eax + 16]
4709    pmaddubsw  xmm1, xmm4
4710    pmaddubsw  xmm7, xmm4
4711    phaddsw    xmm1, xmm7   // R
4712    movdqu     xmm6, [eax]  // A
4713    movdqu     xmm7, [eax + 16]
4714    pmaddubsw  xmm6, xmm5
4715    pmaddubsw  xmm7, xmm5
4716    phaddsw    xmm6, xmm7   // A
4717    psraw      xmm1, 6      // R
4718    psraw      xmm6, 6      // A
4719    packuswb   xmm1, xmm1   // 8 R values
4720    packuswb   xmm6, xmm6   // 8 A values
4721    punpcklbw  xmm1, xmm6   // 8 RA values
4722    movdqa     xmm6, xmm0   // Weave BG, RA together
4723    punpcklwd  xmm0, xmm1   // BGRA first 4
4724    punpckhwd  xmm6, xmm1   // BGRA next 4
4725    movdqu     [edx], xmm0
4726    movdqu     [edx + 16], xmm6
4727    lea        eax, [eax + 32]
4728    lea        edx, [edx + 32]
4729    sub        ecx, 8
4730    jg         convertloop
4731    ret
4732  }
4733}
4734#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4735
4736#ifdef HAS_ARGBQUANTIZEROW_SSE2
4737// Quantize 4 ARGB pixels (16 bytes).
4738__declspec(naked)
4739void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4740                          int interval_offset, int width) {
4741  __asm {
4742    mov        eax, [esp + 4]    /* dst_argb */
4743    movd       xmm2, [esp + 8]   /* scale */
4744    movd       xmm3, [esp + 12]  /* interval_size */
4745    movd       xmm4, [esp + 16]  /* interval_offset */
4746    mov        ecx, [esp + 20]   /* width */
4747    pshuflw    xmm2, xmm2, 040h
4748    pshufd     xmm2, xmm2, 044h
4749    pshuflw    xmm3, xmm3, 040h
4750    pshufd     xmm3, xmm3, 044h
4751    pshuflw    xmm4, xmm4, 040h
4752    pshufd     xmm4, xmm4, 044h
4753    pxor       xmm5, xmm5  // constant 0
4754    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
4755    pslld      xmm6, 24
4756
4757 convertloop:
4758    movdqu     xmm0, [eax]  // read 4 pixels
4759    punpcklbw  xmm0, xmm5   // first 2 pixels
4760    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
4761    movdqu     xmm1, [eax]  // read 4 pixels
4762    punpckhbw  xmm1, xmm5   // next 2 pixels
4763    pmulhuw    xmm1, xmm2
4764    pmullw     xmm0, xmm3   // * interval_size
4765    movdqu     xmm7, [eax]  // read 4 pixels
4766    pmullw     xmm1, xmm3
4767    pand       xmm7, xmm6   // mask alpha
4768    paddw      xmm0, xmm4   // + interval_size / 2
4769    paddw      xmm1, xmm4
4770    packuswb   xmm0, xmm1
4771    por        xmm0, xmm7
4772    movdqu     [eax], xmm0
4773    lea        eax, [eax + 16]
4774    sub        ecx, 4
4775    jg         convertloop
4776    ret
4777  }
4778}
4779#endif  // HAS_ARGBQUANTIZEROW_SSE2
4780
4781#ifdef HAS_ARGBSHADEROW_SSE2
4782// Shade 4 pixels at a time by specified value.
4783__declspec(naked)
4784void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4785                       uint32 value) {
4786  __asm {
4787    mov        eax, [esp + 4]   // src_argb
4788    mov        edx, [esp + 8]   // dst_argb
4789    mov        ecx, [esp + 12]  // width
4790    movd       xmm2, [esp + 16]  // value
4791    punpcklbw  xmm2, xmm2
4792    punpcklqdq xmm2, xmm2
4793
4794 convertloop:
4795    movdqu     xmm0, [eax]      // read 4 pixels
4796    lea        eax, [eax + 16]
4797    movdqa     xmm1, xmm0
4798    punpcklbw  xmm0, xmm0       // first 2
4799    punpckhbw  xmm1, xmm1       // next 2
4800    pmulhuw    xmm0, xmm2       // argb * value
4801    pmulhuw    xmm1, xmm2       // argb * value
4802    psrlw      xmm0, 8
4803    psrlw      xmm1, 8
4804    packuswb   xmm0, xmm1
4805    movdqu     [edx], xmm0
4806    lea        edx, [edx + 16]
4807    sub        ecx, 4
4808    jg         convertloop
4809
4810    ret
4811  }
4812}
4813#endif  // HAS_ARGBSHADEROW_SSE2
4814
4815#ifdef HAS_ARGBMULTIPLYROW_SSE2
4816// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4817__declspec(naked)
4818void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4819                          uint8* dst_argb, int width) {
4820  __asm {
4821    push       esi
4822    mov        eax, [esp + 4 + 4]   // src_argb0
4823    mov        esi, [esp + 4 + 8]   // src_argb1
4824    mov        edx, [esp + 4 + 12]  // dst_argb
4825    mov        ecx, [esp + 4 + 16]  // width
4826    pxor       xmm5, xmm5  // constant 0
4827
4828 convertloop:
4829    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4830    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
4831    movdqu     xmm1, xmm0
4832    movdqu     xmm3, xmm2
4833    punpcklbw  xmm0, xmm0         // first 2
4834    punpckhbw  xmm1, xmm1         // next 2
4835    punpcklbw  xmm2, xmm5         // first 2
4836    punpckhbw  xmm3, xmm5         // next 2
4837    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
4838    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
4839    lea        eax, [eax + 16]
4840    lea        esi, [esi + 16]
4841    packuswb   xmm0, xmm1
4842    movdqu     [edx], xmm0
4843    lea        edx, [edx + 16]
4844    sub        ecx, 4
4845    jg         convertloop
4846
4847    pop        esi
4848    ret
4849  }
4850}
4851#endif  // HAS_ARGBMULTIPLYROW_SSE2
4852
4853#ifdef HAS_ARGBADDROW_SSE2
4854// Add 2 rows of ARGB pixels together, 4 pixels at a time.
4855// TODO(fbarchard): Port this to posix, neon and other math functions.
4856__declspec(naked)
4857void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4858                     uint8* dst_argb, int width) {
4859  __asm {
4860    push       esi
4861    mov        eax, [esp + 4 + 4]   // src_argb0
4862    mov        esi, [esp + 4 + 8]   // src_argb1
4863    mov        edx, [esp + 4 + 12]  // dst_argb
4864    mov        ecx, [esp + 4 + 16]  // width
4865
4866    sub        ecx, 4
4867    jl         convertloop49
4868
4869 convertloop4:
4870    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4871    lea        eax, [eax + 16]
4872    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4873    lea        esi, [esi + 16]
4874    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4875    movdqu     [edx], xmm0
4876    lea        edx, [edx + 16]
4877    sub        ecx, 4
4878    jge        convertloop4
4879
4880 convertloop49:
4881    add        ecx, 4 - 1
4882    jl         convertloop19
4883
4884 convertloop1:
4885    movd       xmm0, [eax]        // read 1 pixels from src_argb0
4886    lea        eax, [eax + 4]
4887    movd       xmm1, [esi]        // read 1 pixels from src_argb1
4888    lea        esi, [esi + 4]
4889    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4890    movd       [edx], xmm0
4891    lea        edx, [edx + 4]
4892    sub        ecx, 1
4893    jge        convertloop1
4894
4895 convertloop19:
4896    pop        esi
4897    ret
4898  }
4899}
4900#endif  // HAS_ARGBADDROW_SSE2
4901
4902#ifdef HAS_ARGBSUBTRACTROW_SSE2
4903// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4904__declspec(naked)
4905void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4906                          uint8* dst_argb, int width) {
4907  __asm {
4908    push       esi
4909    mov        eax, [esp + 4 + 4]   // src_argb0
4910    mov        esi, [esp + 4 + 8]   // src_argb1
4911    mov        edx, [esp + 4 + 12]  // dst_argb
4912    mov        ecx, [esp + 4 + 16]  // width
4913
4914 convertloop:
4915    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4916    lea        eax, [eax + 16]
4917    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4918    lea        esi, [esi + 16]
4919    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
4920    movdqu     [edx], xmm0
4921    lea        edx, [edx + 16]
4922    sub        ecx, 4
4923    jg         convertloop
4924
4925    pop        esi
4926    ret
4927  }
4928}
4929#endif  // HAS_ARGBSUBTRACTROW_SSE2
4930
4931#ifdef HAS_ARGBMULTIPLYROW_AVX2
4932// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4933__declspec(naked)
4934void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4935                          uint8* dst_argb, int width) {
4936  __asm {
4937    push       esi
4938    mov        eax, [esp + 4 + 4]   // src_argb0
4939    mov        esi, [esp + 4 + 8]   // src_argb1
4940    mov        edx, [esp + 4 + 12]  // dst_argb
4941    mov        ecx, [esp + 4 + 16]  // width
4942    vpxor      ymm5, ymm5, ymm5     // constant 0
4943
4944 convertloop:
4945    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
4946    lea        eax, [eax + 32]
4947    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
4948    lea        esi, [esi + 32]
4949    vpunpcklbw ymm0, ymm1, ymm1   // low 4
4950    vpunpckhbw ymm1, ymm1, ymm1   // high 4
4951    vpunpcklbw ymm2, ymm3, ymm5   // low 4
4952    vpunpckhbw ymm3, ymm3, ymm5   // high 4
4953    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
4954    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
4955    vpackuswb  ymm0, ymm0, ymm1
4956    vmovdqu    [edx], ymm0
4957    lea        edx, [edx + 32]
4958    sub        ecx, 8
4959    jg         convertloop
4960
4961    pop        esi
4962    vzeroupper
4963    ret
4964  }
4965}
4966#endif  // HAS_ARGBMULTIPLYROW_AVX2
4967
4968#ifdef HAS_ARGBADDROW_AVX2
4969// Add 2 rows of ARGB pixels together, 8 pixels at a time.
4970__declspec(naked)
4971void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4972                     uint8* dst_argb, int width) {
4973  __asm {
4974    push       esi
4975    mov        eax, [esp + 4 + 4]   // src_argb0
4976    mov        esi, [esp + 4 + 8]   // src_argb1
4977    mov        edx, [esp + 4 + 12]  // dst_argb
4978    mov        ecx, [esp + 4 + 16]  // width
4979
4980 convertloop:
4981    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
4982    lea        eax, [eax + 32]
4983    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
4984    lea        esi, [esi + 32]
4985    vmovdqu    [edx], ymm0
4986    lea        edx, [edx + 32]
4987    sub        ecx, 8
4988    jg         convertloop
4989
4990    pop        esi
4991    vzeroupper
4992    ret
4993  }
4994}
4995#endif  // HAS_ARGBADDROW_AVX2
4996
4997#ifdef HAS_ARGBSUBTRACTROW_AVX2
4998// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4999__declspec(naked)
5000void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5001                          uint8* dst_argb, int width) {
5002  __asm {
5003    push       esi
5004    mov        eax, [esp + 4 + 4]   // src_argb0
5005    mov        esi, [esp + 4 + 8]   // src_argb1
5006    mov        edx, [esp + 4 + 12]  // dst_argb
5007    mov        ecx, [esp + 4 + 16]  // width
5008
5009 convertloop:
5010    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
5011    lea        eax, [eax + 32]
5012    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
5013    lea        esi, [esi + 32]
5014    vmovdqu    [edx], ymm0
5015    lea        edx, [edx + 32]
5016    sub        ecx, 8
5017    jg         convertloop
5018
5019    pop        esi
5020    vzeroupper
5021    ret
5022  }
5023}
5024#endif  // HAS_ARGBSUBTRACTROW_AVX2
5025
5026#ifdef HAS_SOBELXROW_SSE2
5027// SobelX as a matrix is
5028// -1  0  1
5029// -2  0  2
5030// -1  0  1
5031__declspec(naked)
5032void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5033                    const uint8* src_y2, uint8* dst_sobelx, int width) {
5034  __asm {
5035    push       esi
5036    push       edi
5037    mov        eax, [esp + 8 + 4]   // src_y0
5038    mov        esi, [esp + 8 + 8]   // src_y1
5039    mov        edi, [esp + 8 + 12]  // src_y2
5040    mov        edx, [esp + 8 + 16]  // dst_sobelx
5041    mov        ecx, [esp + 8 + 20]  // width
5042    sub        esi, eax
5043    sub        edi, eax
5044    sub        edx, eax
5045    pxor       xmm5, xmm5  // constant 0
5046
5047 convertloop:
5048    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5049    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5050    punpcklbw  xmm0, xmm5
5051    punpcklbw  xmm1, xmm5
5052    psubw      xmm0, xmm1
5053    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5054    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5055    punpcklbw  xmm1, xmm5
5056    punpcklbw  xmm2, xmm5
5057    psubw      xmm1, xmm2
5058    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
5059    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
5060    punpcklbw  xmm2, xmm5
5061    punpcklbw  xmm3, xmm5
5062    psubw      xmm2, xmm3
5063    paddw      xmm0, xmm2
5064    paddw      xmm0, xmm1
5065    paddw      xmm0, xmm1
5066    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5067    psubw      xmm1, xmm0
5068    pmaxsw     xmm0, xmm1
5069    packuswb   xmm0, xmm0
5070    movq       qword ptr [eax + edx], xmm0
5071    lea        eax, [eax + 8]
5072    sub        ecx, 8
5073    jg         convertloop
5074
5075    pop        edi
5076    pop        esi
5077    ret
5078  }
5079}
5080#endif  // HAS_SOBELXROW_SSE2
5081
5082#ifdef HAS_SOBELYROW_SSE2
5083// SobelY as a matrix is
5084// -1 -2 -1
5085//  0  0  0
5086//  1  2  1
5087__declspec(naked)
5088void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5089                    uint8* dst_sobely, int width) {
5090  __asm {
5091    push       esi
5092    mov        eax, [esp + 4 + 4]   // src_y0
5093    mov        esi, [esp + 4 + 8]   // src_y1
5094    mov        edx, [esp + 4 + 12]  // dst_sobely
5095    mov        ecx, [esp + 4 + 16]  // width
5096    sub        esi, eax
5097    sub        edx, eax
5098    pxor       xmm5, xmm5  // constant 0
5099
5100 convertloop:
5101    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5102    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5103    punpcklbw  xmm0, xmm5
5104    punpcklbw  xmm1, xmm5
5105    psubw      xmm0, xmm1
5106    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
5107    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
5108    punpcklbw  xmm1, xmm5
5109    punpcklbw  xmm2, xmm5
5110    psubw      xmm1, xmm2
5111    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5112    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5113    punpcklbw  xmm2, xmm5
5114    punpcklbw  xmm3, xmm5
5115    psubw      xmm2, xmm3
5116    paddw      xmm0, xmm2
5117    paddw      xmm0, xmm1
5118    paddw      xmm0, xmm1
5119    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5120    psubw      xmm1, xmm0
5121    pmaxsw     xmm0, xmm1
5122    packuswb   xmm0, xmm0
5123    movq       qword ptr [eax + edx], xmm0
5124    lea        eax, [eax + 8]
5125    sub        ecx, 8
5126    jg         convertloop
5127
5128    pop        esi
5129    ret
5130  }
5131}
5132#endif  // HAS_SOBELYROW_SSE2
5133
5134#ifdef HAS_SOBELROW_SSE2
5135// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5136// A = 255
5137// R = Sobel
5138// G = Sobel
5139// B = Sobel
5140__declspec(naked)
5141void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5142                   uint8* dst_argb, int width) {
5143  __asm {
5144    push       esi
5145    mov        eax, [esp + 4 + 4]   // src_sobelx
5146    mov        esi, [esp + 4 + 8]   // src_sobely
5147    mov        edx, [esp + 4 + 12]  // dst_argb
5148    mov        ecx, [esp + 4 + 16]  // width
5149    sub        esi, eax
5150    pcmpeqb    xmm5, xmm5           // alpha 255
5151    pslld      xmm5, 24             // 0xff000000
5152
5153 convertloop:
5154    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5155    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5156    lea        eax, [eax + 16]
5157    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
5158    movdqa     xmm2, xmm0             // GG
5159    punpcklbw  xmm2, xmm0             // First 8
5160    punpckhbw  xmm0, xmm0             // Next 8
5161    movdqa     xmm1, xmm2             // GGGG
5162    punpcklwd  xmm1, xmm2             // First 4
5163    punpckhwd  xmm2, xmm2             // Next 4
5164    por        xmm1, xmm5             // GGGA
5165    por        xmm2, xmm5
5166    movdqa     xmm3, xmm0             // GGGG
5167    punpcklwd  xmm3, xmm0             // Next 4
5168    punpckhwd  xmm0, xmm0             // Last 4
5169    por        xmm3, xmm5             // GGGA
5170    por        xmm0, xmm5
5171    movdqu     [edx], xmm1
5172    movdqu     [edx + 16], xmm2
5173    movdqu     [edx + 32], xmm3
5174    movdqu     [edx + 48], xmm0
5175    lea        edx, [edx + 64]
5176    sub        ecx, 16
5177    jg         convertloop
5178
5179    pop        esi
5180    ret
5181  }
5182}
5183#endif  // HAS_SOBELROW_SSE2
5184
5185#ifdef HAS_SOBELTOPLANEROW_SSE2
5186// Adds Sobel X and Sobel Y and stores Sobel into a plane.
5187__declspec(naked)
5188void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5189                          uint8* dst_y, int width) {
5190  __asm {
5191    push       esi
5192    mov        eax, [esp + 4 + 4]   // src_sobelx
5193    mov        esi, [esp + 4 + 8]   // src_sobely
5194    mov        edx, [esp + 4 + 12]  // dst_argb
5195    mov        ecx, [esp + 4 + 16]  // width
5196    sub        esi, eax
5197
5198 convertloop:
5199    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5200    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5201    lea        eax, [eax + 16]
5202    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
5203    movdqu     [edx], xmm0
5204    lea        edx, [edx + 16]
5205    sub        ecx, 16
5206    jg         convertloop
5207
5208    pop        esi
5209    ret
5210  }
5211}
5212#endif  // HAS_SOBELTOPLANEROW_SSE2
5213
5214#ifdef HAS_SOBELXYROW_SSE2
5215// Mixes Sobel X, Sobel Y and Sobel into ARGB.
5216// A = 255
5217// R = Sobel X
5218// G = Sobel
5219// B = Sobel Y
5220__declspec(naked)
5221void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5222                     uint8* dst_argb, int width) {
5223  __asm {
5224    push       esi
5225    mov        eax, [esp + 4 + 4]   // src_sobelx
5226    mov        esi, [esp + 4 + 8]   // src_sobely
5227    mov        edx, [esp + 4 + 12]  // dst_argb
5228    mov        ecx, [esp + 4 + 16]  // width
5229    sub        esi, eax
5230    pcmpeqb    xmm5, xmm5           // alpha 255
5231
5232 convertloop:
5233    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5234    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5235    lea        eax, [eax + 16]
5236    movdqa     xmm2, xmm0
5237    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
5238    movdqa     xmm3, xmm0             // XA
5239    punpcklbw  xmm3, xmm5
5240    punpckhbw  xmm0, xmm5
5241    movdqa     xmm4, xmm1             // YS
5242    punpcklbw  xmm4, xmm2
5243    punpckhbw  xmm1, xmm2
5244    movdqa     xmm6, xmm4             // YSXA
5245    punpcklwd  xmm6, xmm3             // First 4
5246    punpckhwd  xmm4, xmm3             // Next 4
5247    movdqa     xmm7, xmm1             // YSXA
5248    punpcklwd  xmm7, xmm0             // Next 4
5249    punpckhwd  xmm1, xmm0             // Last 4
5250    movdqu     [edx], xmm6
5251    movdqu     [edx + 16], xmm4
5252    movdqu     [edx + 32], xmm7
5253    movdqu     [edx + 48], xmm1
5254    lea        edx, [edx + 64]
5255    sub        ecx, 16
5256    jg         convertloop
5257
5258    pop        esi
5259    ret
5260  }
5261}
5262#endif  // HAS_SOBELXYROW_SSE2
5263
5264#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5265// Consider float CumulativeSum.
5266// Consider calling CumulativeSum one row at time as needed.
5267// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5268// Convert cumulative sum for an area to an average for 1 pixel.
5269// topleft is pointer to top left of CumulativeSum buffer for area.
5270// botleft is pointer to bottom left of CumulativeSum buffer.
5271// width is offset from left to right of area in CumulativeSum buffer measured
5272//   in number of ints.
5273// area is the number of pixels in the area being averaged.
5274// dst points to pixel to store result to.
5275// count is number of averaged pixels to produce.
5276// Does 4 pixels at a time.
5277// This function requires alignment on accumulation buffer pointers.
5278void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5279                                    int width, int area, uint8* dst,
5280                                    int count) {
5281  __asm {
5282    mov        eax, topleft  // eax topleft
5283    mov        esi, botleft  // esi botleft
5284    mov        edx, width
5285    movd       xmm5, area
5286    mov        edi, dst
5287    mov        ecx, count
5288    cvtdq2ps   xmm5, xmm5
5289    rcpss      xmm4, xmm5  // 1.0f / area
5290    pshufd     xmm4, xmm4, 0
5291    sub        ecx, 4
5292    jl         l4b
5293
5294    cmp        area, 128  // 128 pixels will not overflow 15 bits.
5295    ja         l4
5296
5297    pshufd     xmm5, xmm5, 0        // area
5298    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
5299    psrld      xmm6, 16
5300    cvtdq2ps   xmm6, xmm6
5301    addps      xmm5, xmm6           // (65536.0 + area - 1)
5302    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
5303    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
5304    packssdw   xmm5, xmm5           // 16 bit shorts
5305
5306    // 4 pixel loop small blocks.
5307  s4:
5308    // top left
5309    movdqu     xmm0, [eax]
5310    movdqu     xmm1, [eax + 16]
5311    movdqu     xmm2, [eax + 32]
5312    movdqu     xmm3, [eax + 48]
5313
5314    // - top right
5315    psubd      xmm0, [eax + edx * 4]
5316    psubd      xmm1, [eax + edx * 4 + 16]
5317    psubd      xmm2, [eax + edx * 4 + 32]
5318    psubd      xmm3, [eax + edx * 4 + 48]
5319    lea        eax, [eax + 64]
5320
5321    // - bottom left
5322    psubd      xmm0, [esi]
5323    psubd      xmm1, [esi + 16]
5324    psubd      xmm2, [esi + 32]
5325    psubd      xmm3, [esi + 48]
5326
5327    // + bottom right
5328    paddd      xmm0, [esi + edx * 4]
5329    paddd      xmm1, [esi + edx * 4 + 16]
5330    paddd      xmm2, [esi + edx * 4 + 32]
5331    paddd      xmm3, [esi + edx * 4 + 48]
5332    lea        esi, [esi + 64]
5333
5334    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
5335    packssdw   xmm2, xmm3
5336
5337    pmulhuw    xmm0, xmm5
5338    pmulhuw    xmm2, xmm5
5339
5340    packuswb   xmm0, xmm2
5341    movdqu     [edi], xmm0
5342    lea        edi, [edi + 16]
5343    sub        ecx, 4
5344    jge        s4
5345
5346    jmp        l4b
5347
5348    // 4 pixel loop
5349  l4:
5350    // top left
5351    movdqu     xmm0, [eax]
5352    movdqu     xmm1, [eax + 16]
5353    movdqu     xmm2, [eax + 32]
5354    movdqu     xmm3, [eax + 48]
5355
5356    // - top right
5357    psubd      xmm0, [eax + edx * 4]
5358    psubd      xmm1, [eax + edx * 4 + 16]
5359    psubd      xmm2, [eax + edx * 4 + 32]
5360    psubd      xmm3, [eax + edx * 4 + 48]
5361    lea        eax, [eax + 64]
5362
5363    // - bottom left
5364    psubd      xmm0, [esi]
5365    psubd      xmm1, [esi + 16]
5366    psubd      xmm2, [esi + 32]
5367    psubd      xmm3, [esi + 48]
5368
5369    // + bottom right
5370    paddd      xmm0, [esi + edx * 4]
5371    paddd      xmm1, [esi + edx * 4 + 16]
5372    paddd      xmm2, [esi + edx * 4 + 32]
5373    paddd      xmm3, [esi + edx * 4 + 48]
5374    lea        esi, [esi + 64]
5375
5376    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
5377    cvtdq2ps   xmm1, xmm1
5378    mulps      xmm0, xmm4
5379    mulps      xmm1, xmm4
5380    cvtdq2ps   xmm2, xmm2
5381    cvtdq2ps   xmm3, xmm3
5382    mulps      xmm2, xmm4
5383    mulps      xmm3, xmm4
5384    cvtps2dq   xmm0, xmm0
5385    cvtps2dq   xmm1, xmm1
5386    cvtps2dq   xmm2, xmm2
5387    cvtps2dq   xmm3, xmm3
5388    packssdw   xmm0, xmm1
5389    packssdw   xmm2, xmm3
5390    packuswb   xmm0, xmm2
5391    movdqu     [edi], xmm0
5392    lea        edi, [edi + 16]
5393    sub        ecx, 4
5394    jge        l4
5395
5396  l4b:
5397    add        ecx, 4 - 1
5398    jl         l1b
5399
5400    // 1 pixel loop
5401  l1:
5402    movdqu     xmm0, [eax]
5403    psubd      xmm0, [eax + edx * 4]
5404    lea        eax, [eax + 16]
5405    psubd      xmm0, [esi]
5406    paddd      xmm0, [esi + edx * 4]
5407    lea        esi, [esi + 16]
5408    cvtdq2ps   xmm0, xmm0
5409    mulps      xmm0, xmm4
5410    cvtps2dq   xmm0, xmm0
5411    packssdw   xmm0, xmm0
5412    packuswb   xmm0, xmm0
5413    movd       dword ptr [edi], xmm0
5414    lea        edi, [edi + 4]
5415    sub        ecx, 1
5416    jge        l1
5417  l1b:
5418  }
5419}
5420#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5421
5422#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5423// Creates a table of cumulative sums where each value is a sum of all values
5424// above and to the left of the value.
5425void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5426                                  const int32* previous_cumsum, int width) {
5427  __asm {
5428    mov        eax, row
5429    mov        edx, cumsum
5430    mov        esi, previous_cumsum
5431    mov        ecx, width
5432    pxor       xmm0, xmm0
5433    pxor       xmm1, xmm1
5434
5435    sub        ecx, 4
5436    jl         l4b
5437    test       edx, 15
5438    jne        l4b
5439
5440    // 4 pixel loop
5441  l4:
5442    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
5443    lea        eax, [eax + 16]
5444    movdqa     xmm4, xmm2
5445
5446    punpcklbw  xmm2, xmm1
5447    movdqa     xmm3, xmm2
5448    punpcklwd  xmm2, xmm1
5449    punpckhwd  xmm3, xmm1
5450
5451    punpckhbw  xmm4, xmm1
5452    movdqa     xmm5, xmm4
5453    punpcklwd  xmm4, xmm1
5454    punpckhwd  xmm5, xmm1
5455
5456    paddd      xmm0, xmm2
5457    movdqu     xmm2, [esi]  // previous row above.
5458    paddd      xmm2, xmm0
5459
5460    paddd      xmm0, xmm3
5461    movdqu     xmm3, [esi + 16]
5462    paddd      xmm3, xmm0
5463
5464    paddd      xmm0, xmm4
5465    movdqu     xmm4, [esi + 32]
5466    paddd      xmm4, xmm0
5467
5468    paddd      xmm0, xmm5
5469    movdqu     xmm5, [esi + 48]
5470    lea        esi, [esi + 64]
5471    paddd      xmm5, xmm0
5472
5473    movdqu     [edx], xmm2
5474    movdqu     [edx + 16], xmm3
5475    movdqu     [edx + 32], xmm4
5476    movdqu     [edx + 48], xmm5
5477
5478    lea        edx, [edx + 64]
5479    sub        ecx, 4
5480    jge        l4
5481
5482  l4b:
5483    add        ecx, 4 - 1
5484    jl         l1b
5485
5486    // 1 pixel loop
5487  l1:
5488    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
5489    lea        eax, [eax + 4]
5490    punpcklbw  xmm2, xmm1
5491    punpcklwd  xmm2, xmm1
5492    paddd      xmm0, xmm2
5493    movdqu     xmm2, [esi]
5494    lea        esi, [esi + 16]
5495    paddd      xmm2, xmm0
5496    movdqu     [edx], xmm2
5497    lea        edx, [edx + 16]
5498    sub        ecx, 1
5499    jge        l1
5500
5501 l1b:
5502  }
5503}
5504#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
5505
5506#ifdef HAS_ARGBAFFINEROW_SSE2
5507// Copy ARGB pixels from source image with slope to a row of destination.
5508__declspec(naked)
5509LIBYUV_API
5510void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5511                        uint8* dst_argb, const float* uv_dudv, int width) {
5512  __asm {
5513    push       esi
5514    push       edi
5515    mov        eax, [esp + 12]  // src_argb
5516    mov        esi, [esp + 16]  // stride
5517    mov        edx, [esp + 20]  // dst_argb
5518    mov        ecx, [esp + 24]  // pointer to uv_dudv
5519    movq       xmm2, qword ptr [ecx]  // uv
5520    movq       xmm7, qword ptr [ecx + 8]  // dudv
5521    mov        ecx, [esp + 28]  // width
5522    shl        esi, 16          // 4, stride
5523    add        esi, 4
5524    movd       xmm5, esi
5525    sub        ecx, 4
5526    jl         l4b
5527
5528    // setup for 4 pixel loop
5529    pshufd     xmm7, xmm7, 0x44  // dup dudv
5530    pshufd     xmm5, xmm5, 0  // dup 4, stride
5531    movdqa     xmm0, xmm2    // x0, y0, x1, y1
5532    addps      xmm0, xmm7
5533    movlhps    xmm2, xmm0
5534    movdqa     xmm4, xmm7
5535    addps      xmm4, xmm4    // dudv *= 2
5536    movdqa     xmm3, xmm2    // x2, y2, x3, y3
5537    addps      xmm3, xmm4
5538    addps      xmm4, xmm4    // dudv *= 4
5539
5540    // 4 pixel loop
5541  l4:
5542    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
5543    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
5544    packssdw   xmm0, xmm1    // x, y as 8 shorts
5545    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
5546    movd       esi, xmm0
5547    pshufd     xmm0, xmm0, 0x39  // shift right
5548    movd       edi, xmm0
5549    pshufd     xmm0, xmm0, 0x39  // shift right
5550    movd       xmm1, [eax + esi]  // read pixel 0
5551    movd       xmm6, [eax + edi]  // read pixel 1
5552    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
5553    addps      xmm2, xmm4    // x, y += dx, dy first 2
5554    movq       qword ptr [edx], xmm1
5555    movd       esi, xmm0
5556    pshufd     xmm0, xmm0, 0x39  // shift right
5557    movd       edi, xmm0
5558    movd       xmm6, [eax + esi]  // read pixel 2
5559    movd       xmm0, [eax + edi]  // read pixel 3
5560    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
5561    addps      xmm3, xmm4    // x, y += dx, dy next 2
5562    movq       qword ptr 8[edx], xmm6
5563    lea        edx, [edx + 16]
5564    sub        ecx, 4
5565    jge        l4
5566
5567  l4b:
5568    add        ecx, 4 - 1
5569    jl         l1b
5570
5571    // 1 pixel loop
5572  l1:
5573    cvttps2dq  xmm0, xmm2    // x, y float to int
5574    packssdw   xmm0, xmm0    // x, y as shorts
5575    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
5576    addps      xmm2, xmm7    // x, y += dx, dy
5577    movd       esi, xmm0
5578    movd       xmm0, [eax + esi]  // copy a pixel
5579    movd       [edx], xmm0
5580    lea        edx, [edx + 4]
5581    sub        ecx, 1
5582    jge        l1
5583  l1b:
5584    pop        edi
5585    pop        esi
5586    ret
5587  }
5588}
5589#endif  // HAS_ARGBAFFINEROW_SSE2
5590
5591#ifdef HAS_INTERPOLATEROW_AVX2
5592// Bilinear filter 32x2 -> 32x1
5593__declspec(naked)
5594void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
5595                         ptrdiff_t src_stride, int dst_width,
5596                         int source_y_fraction) {
5597  __asm {
5598    push       esi
5599    push       edi
5600    mov        edi, [esp + 8 + 4]   // dst_ptr
5601    mov        esi, [esp + 8 + 8]   // src_ptr
5602    mov        edx, [esp + 8 + 12]  // src_stride
5603    mov        ecx, [esp + 8 + 16]  // dst_width
5604    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5605    // Dispatch to specialized filters if applicable.
5606    cmp        eax, 0
5607    je         xloop100  // 0 / 256.  Blend 100 / 0.
5608    sub        edi, esi
5609    cmp        eax, 128
5610    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
5611
5612    vmovd      xmm0, eax  // high fraction 0..255
5613    neg        eax
5614    add        eax, 256
5615    vmovd      xmm5, eax  // low fraction 256..1
5616    vpunpcklbw xmm5, xmm5, xmm0
5617    vpunpcklwd xmm5, xmm5, xmm5
5618    vbroadcastss ymm5, xmm5
5619
5620    mov        eax, 0x80808080  // 128b for bias and rounding.
5621    vmovd      xmm4, eax
5622    vbroadcastss ymm4, xmm4
5623
5624  xloop:
5625    vmovdqu    ymm0, [esi]
5626    vmovdqu    ymm2, [esi + edx]
5627    vpunpckhbw ymm1, ymm0, ymm2  // mutates
5628    vpunpcklbw ymm0, ymm0, ymm2
5629    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
5630    vpsubb     ymm0, ymm0, ymm4
5631    vpmaddubsw ymm1, ymm5, ymm1
5632    vpmaddubsw ymm0, ymm5, ymm0
5633    vpaddw     ymm1, ymm1, ymm4  // unbias and round
5634    vpaddw     ymm0, ymm0, ymm4
5635    vpsrlw     ymm1, ymm1, 8
5636    vpsrlw     ymm0, ymm0, 8
5637    vpackuswb  ymm0, ymm0, ymm1  // unmutates
5638    vmovdqu    [esi + edi], ymm0
5639    lea        esi, [esi + 32]
5640    sub        ecx, 32
5641    jg         xloop
5642    jmp        xloop99
5643
5644   // Blend 50 / 50.
5645 xloop50:
5646   vmovdqu    ymm0, [esi]
5647   vpavgb     ymm0, ymm0, [esi + edx]
5648   vmovdqu    [esi + edi], ymm0
5649   lea        esi, [esi + 32]
5650   sub        ecx, 32
5651   jg         xloop50
5652   jmp        xloop99
5653
5654   // Blend 100 / 0 - Copy row unchanged.
5655 xloop100:
5656   rep movsb
5657
5658  xloop99:
5659    pop        edi
5660    pop        esi
5661    vzeroupper
5662    ret
5663  }
5664}
5665#endif  // HAS_INTERPOLATEROW_AVX2
5666
5667// Bilinear filter 16x2 -> 16x1
5668// TODO(fbarchard): Consider allowing 256 using memcpy.
5669__declspec(naked)
5670void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5671                          ptrdiff_t src_stride, int dst_width,
5672                          int source_y_fraction) {
5673  __asm {
5674    push       esi
5675    push       edi
5676
5677    mov        edi, [esp + 8 + 4]   // dst_ptr
5678    mov        esi, [esp + 8 + 8]   // src_ptr
5679    mov        edx, [esp + 8 + 12]  // src_stride
5680    mov        ecx, [esp + 8 + 16]  // dst_width
5681    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5682    sub        edi, esi
5683    // Dispatch to specialized filters if applicable.
5684    cmp        eax, 0
5685    je         xloop100  // 0 /256.  Blend 100 / 0.
5686    cmp        eax, 128
5687    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
5688
5689    movd       xmm0, eax  // high fraction 0..255
5690    neg        eax
5691    add        eax, 256
5692    movd       xmm5, eax  // low fraction 255..1
5693    punpcklbw  xmm5, xmm0
5694    punpcklwd  xmm5, xmm5
5695    pshufd     xmm5, xmm5, 0
5696    mov        eax, 0x80808080  // 128 for biasing image to signed.
5697    movd       xmm4, eax
5698    pshufd     xmm4, xmm4, 0x00
5699
5700  xloop:
5701    movdqu     xmm0, [esi]
5702    movdqu     xmm2, [esi + edx]
5703    movdqu     xmm1, xmm0
5704    punpcklbw  xmm0, xmm2
5705    punpckhbw  xmm1, xmm2
5706    psubb      xmm0, xmm4  // bias image by -128
5707    psubb      xmm1, xmm4
5708    movdqa     xmm2, xmm5
5709    movdqa     xmm3, xmm5
5710    pmaddubsw  xmm2, xmm0
5711    pmaddubsw  xmm3, xmm1
5712    paddw      xmm2, xmm4
5713    paddw      xmm3, xmm4
5714    psrlw      xmm2, 8
5715    psrlw      xmm3, 8
5716    packuswb   xmm2, xmm3
5717    movdqu     [esi + edi], xmm2
5718    lea        esi, [esi + 16]
5719    sub        ecx, 16
5720    jg         xloop
5721    jmp        xloop99
5722
5723    // Blend 50 / 50.
5724  xloop50:
5725    movdqu     xmm0, [esi]
5726    movdqu     xmm1, [esi + edx]
5727    pavgb      xmm0, xmm1
5728    movdqu     [esi + edi], xmm0
5729    lea        esi, [esi + 16]
5730    sub        ecx, 16
5731    jg         xloop50
5732    jmp        xloop99
5733
5734    // Blend 100 / 0 - Copy row unchanged.
5735  xloop100:
5736    movdqu     xmm0, [esi]
5737    movdqu     [esi + edi], xmm0
5738    lea        esi, [esi + 16]
5739    sub        ecx, 16
5740    jg         xloop100
5741
5742  xloop99:
5743    pop        edi
5744    pop        esi
5745    ret
5746  }
5747}
5748
5749// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5750__declspec(naked)
5751void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5752                          const uint8* shuffler, int width) {
5753  __asm {
5754    mov        eax, [esp + 4]    // src_argb
5755    mov        edx, [esp + 8]    // dst_argb
5756    mov        ecx, [esp + 12]   // shuffler
5757    movdqu     xmm5, [ecx]
5758    mov        ecx, [esp + 16]   // width
5759
5760  wloop:
5761    movdqu     xmm0, [eax]
5762    movdqu     xmm1, [eax + 16]
5763    lea        eax, [eax + 32]
5764    pshufb     xmm0, xmm5
5765    pshufb     xmm1, xmm5
5766    movdqu     [edx], xmm0
5767    movdqu     [edx + 16], xmm1
5768    lea        edx, [edx + 32]
5769    sub        ecx, 8
5770    jg         wloop
5771    ret
5772  }
5773}
5774
5775#ifdef HAS_ARGBSHUFFLEROW_AVX2
5776__declspec(naked)
5777void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5778                         const uint8* shuffler, int width) {
5779  __asm {
5780    mov        eax, [esp + 4]     // src_argb
5781    mov        edx, [esp + 8]     // dst_argb
5782    mov        ecx, [esp + 12]    // shuffler
5783    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
5784    mov        ecx, [esp + 16]    // width
5785
5786  wloop:
5787    vmovdqu    ymm0, [eax]
5788    vmovdqu    ymm1, [eax + 32]
5789    lea        eax, [eax + 64]
5790    vpshufb    ymm0, ymm0, ymm5
5791    vpshufb    ymm1, ymm1, ymm5
5792    vmovdqu    [edx], ymm0
5793    vmovdqu    [edx + 32], ymm1
5794    lea        edx, [edx + 64]
5795    sub        ecx, 16
5796    jg         wloop
5797
5798    vzeroupper
5799    ret
5800  }
5801}
5802#endif  // HAS_ARGBSHUFFLEROW_AVX2
5803
5804__declspec(naked)
5805void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5806                         const uint8* shuffler, int width) {
5807  __asm {
5808    push       ebx
5809    push       esi
5810    mov        eax, [esp + 8 + 4]    // src_argb
5811    mov        edx, [esp + 8 + 8]    // dst_argb
5812    mov        esi, [esp + 8 + 12]   // shuffler
5813    mov        ecx, [esp + 8 + 16]   // width
5814    pxor       xmm5, xmm5
5815
5816    mov        ebx, [esi]   // shuffler
5817    cmp        ebx, 0x03000102
5818    je         shuf_3012
5819    cmp        ebx, 0x00010203
5820    je         shuf_0123
5821    cmp        ebx, 0x00030201
5822    je         shuf_0321
5823    cmp        ebx, 0x02010003
5824    je         shuf_2103
5825
5826  // TODO(fbarchard): Use one source pointer and 3 offsets.
5827  shuf_any1:
5828    movzx      ebx, byte ptr [esi]
5829    movzx      ebx, byte ptr [eax + ebx]
5830    mov        [edx], bl
5831    movzx      ebx, byte ptr [esi + 1]
5832    movzx      ebx, byte ptr [eax + ebx]
5833    mov        [edx + 1], bl
5834    movzx      ebx, byte ptr [esi + 2]
5835    movzx      ebx, byte ptr [eax + ebx]
5836    mov        [edx + 2], bl
5837    movzx      ebx, byte ptr [esi + 3]
5838    movzx      ebx, byte ptr [eax + ebx]
5839    mov        [edx + 3], bl
5840    lea        eax, [eax + 4]
5841    lea        edx, [edx + 4]
5842    sub        ecx, 1
5843    jg         shuf_any1
5844    jmp        shuf99
5845
5846  shuf_0123:
5847    movdqu     xmm0, [eax]
5848    lea        eax, [eax + 16]
5849    movdqa     xmm1, xmm0
5850    punpcklbw  xmm0, xmm5
5851    punpckhbw  xmm1, xmm5
5852    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
5853    pshuflw    xmm0, xmm0, 01Bh
5854    pshufhw    xmm1, xmm1, 01Bh
5855    pshuflw    xmm1, xmm1, 01Bh
5856    packuswb   xmm0, xmm1
5857    movdqu     [edx], xmm0
5858    lea        edx, [edx + 16]
5859    sub        ecx, 4
5860    jg         shuf_0123
5861    jmp        shuf99
5862
5863  shuf_0321:
5864    movdqu     xmm0, [eax]
5865    lea        eax, [eax + 16]
5866    movdqa     xmm1, xmm0
5867    punpcklbw  xmm0, xmm5
5868    punpckhbw  xmm1, xmm5
5869    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
5870    pshuflw    xmm0, xmm0, 039h
5871    pshufhw    xmm1, xmm1, 039h
5872    pshuflw    xmm1, xmm1, 039h
5873    packuswb   xmm0, xmm1
5874    movdqu     [edx], xmm0
5875    lea        edx, [edx + 16]
5876    sub        ecx, 4
5877    jg         shuf_0321
5878    jmp        shuf99
5879
5880  shuf_2103:
5881    movdqu     xmm0, [eax]
5882    lea        eax, [eax + 16]
5883    movdqa     xmm1, xmm0
5884    punpcklbw  xmm0, xmm5
5885    punpckhbw  xmm1, xmm5
5886    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
5887    pshuflw    xmm0, xmm0, 093h
5888    pshufhw    xmm1, xmm1, 093h
5889    pshuflw    xmm1, xmm1, 093h
5890    packuswb   xmm0, xmm1
5891    movdqu     [edx], xmm0
5892    lea        edx, [edx + 16]
5893    sub        ecx, 4
5894    jg         shuf_2103
5895    jmp        shuf99
5896
5897  shuf_3012:
5898    movdqu     xmm0, [eax]
5899    lea        eax, [eax + 16]
5900    movdqa     xmm1, xmm0
5901    punpcklbw  xmm0, xmm5
5902    punpckhbw  xmm1, xmm5
5903    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
5904    pshuflw    xmm0, xmm0, 0C6h
5905    pshufhw    xmm1, xmm1, 0C6h
5906    pshuflw    xmm1, xmm1, 0C6h
5907    packuswb   xmm0, xmm1
5908    movdqu     [edx], xmm0
5909    lea        edx, [edx + 16]
5910    sub        ecx, 4
5911    jg         shuf_3012
5912
5913  shuf99:
5914    pop        esi
5915    pop        ebx
5916    ret
5917  }
5918}
5919
5920// YUY2 - Macro-pixel = 2 image pixels
5921// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5922
5923// UYVY - Macro-pixel = 2 image pixels
5924// U0Y0V0Y1
5925
5926__declspec(naked)
5927void I422ToYUY2Row_SSE2(const uint8* src_y,
5928                        const uint8* src_u,
5929                        const uint8* src_v,
5930                        uint8* dst_frame, int width) {
5931  __asm {
5932    push       esi
5933    push       edi
5934    mov        eax, [esp + 8 + 4]    // src_y
5935    mov        esi, [esp + 8 + 8]    // src_u
5936    mov        edx, [esp + 8 + 12]   // src_v
5937    mov        edi, [esp + 8 + 16]   // dst_frame
5938    mov        ecx, [esp + 8 + 20]   // width
5939    sub        edx, esi
5940
5941  convertloop:
5942    movq       xmm2, qword ptr [esi] // U
5943    movq       xmm3, qword ptr [esi + edx] // V
5944    lea        esi, [esi + 8]
5945    punpcklbw  xmm2, xmm3 // UV
5946    movdqu     xmm0, [eax] // Y
5947    lea        eax, [eax + 16]
5948    movdqa     xmm1, xmm0
5949    punpcklbw  xmm0, xmm2 // YUYV
5950    punpckhbw  xmm1, xmm2
5951    movdqu     [edi], xmm0
5952    movdqu     [edi + 16], xmm1
5953    lea        edi, [edi + 32]
5954    sub        ecx, 16
5955    jg         convertloop
5956
5957    pop        edi
5958    pop        esi
5959    ret
5960  }
5961}
5962
5963__declspec(naked)
5964void I422ToUYVYRow_SSE2(const uint8* src_y,
5965                        const uint8* src_u,
5966                        const uint8* src_v,
5967                        uint8* dst_frame, int width) {
5968  __asm {
5969    push       esi
5970    push       edi
5971    mov        eax, [esp + 8 + 4]    // src_y
5972    mov        esi, [esp + 8 + 8]    // src_u
5973    mov        edx, [esp + 8 + 12]   // src_v
5974    mov        edi, [esp + 8 + 16]   // dst_frame
5975    mov        ecx, [esp + 8 + 20]   // width
5976    sub        edx, esi
5977
5978  convertloop:
5979    movq       xmm2, qword ptr [esi] // U
5980    movq       xmm3, qword ptr [esi + edx] // V
5981    lea        esi, [esi + 8]
5982    punpcklbw  xmm2, xmm3 // UV
5983    movdqu     xmm0, [eax] // Y
5984    movdqa     xmm1, xmm2
5985    lea        eax, [eax + 16]
5986    punpcklbw  xmm1, xmm0 // UYVY
5987    punpckhbw  xmm2, xmm0
5988    movdqu     [edi], xmm1
5989    movdqu     [edi + 16], xmm2
5990    lea        edi, [edi + 32]
5991    sub        ecx, 16
5992    jg         convertloop
5993
5994    pop        edi
5995    pop        esi
5996    ret
5997  }
5998}
5999
6000#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
6001__declspec(naked)
6002void ARGBPolynomialRow_SSE2(const uint8* src_argb,
6003                            uint8* dst_argb, const float* poly,
6004                            int width) {
6005  __asm {
6006    push       esi
6007    mov        eax, [esp + 4 + 4]   /* src_argb */
6008    mov        edx, [esp + 4 + 8]   /* dst_argb */
6009    mov        esi, [esp + 4 + 12]  /* poly */
6010    mov        ecx, [esp + 4 + 16]  /* width */
6011    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
6012
6013    // 2 pixel loop.
6014 convertloop:
6015//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
6016//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
6017    movq       xmm0, qword ptr [eax]  // BGRABGRA
6018    lea        eax, [eax + 8]
6019    punpcklbw  xmm0, xmm3
6020    movdqa     xmm4, xmm0
6021    punpcklwd  xmm0, xmm3  // pixel 0
6022    punpckhwd  xmm4, xmm3  // pixel 1
6023    cvtdq2ps   xmm0, xmm0  // 4 floats
6024    cvtdq2ps   xmm4, xmm4
6025    movdqa     xmm1, xmm0  // X
6026    movdqa     xmm5, xmm4
6027    mulps      xmm0, [esi + 16]  // C1 * X
6028    mulps      xmm4, [esi + 16]
6029    addps      xmm0, [esi]  // result = C0 + C1 * X
6030    addps      xmm4, [esi]
6031    movdqa     xmm2, xmm1
6032    movdqa     xmm6, xmm5
6033    mulps      xmm2, xmm1  // X * X
6034    mulps      xmm6, xmm5
6035    mulps      xmm1, xmm2  // X * X * X
6036    mulps      xmm5, xmm6
6037    mulps      xmm2, [esi + 32]  // C2 * X * X
6038    mulps      xmm6, [esi + 32]
6039    mulps      xmm1, [esi + 48]  // C3 * X * X * X
6040    mulps      xmm5, [esi + 48]
6041    addps      xmm0, xmm2  // result += C2 * X * X
6042    addps      xmm4, xmm6
6043    addps      xmm0, xmm1  // result += C3 * X * X * X
6044    addps      xmm4, xmm5
6045    cvttps2dq  xmm0, xmm0
6046    cvttps2dq  xmm4, xmm4
6047    packuswb   xmm0, xmm4
6048    packuswb   xmm0, xmm0
6049    movq       qword ptr [edx], xmm0
6050    lea        edx, [edx + 8]
6051    sub        ecx, 2
6052    jg         convertloop
6053    pop        esi
6054    ret
6055  }
6056}
6057#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
6058
6059#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6060__declspec(naked)
6061void ARGBPolynomialRow_AVX2(const uint8* src_argb,
6062                            uint8* dst_argb, const float* poly,
6063                            int width) {
6064  __asm {
6065    mov        eax, [esp + 4]   /* src_argb */
6066    mov        edx, [esp + 8]   /* dst_argb */
6067    mov        ecx, [esp + 12]   /* poly */
6068    vbroadcastf128 ymm4, [ecx]       // C0
6069    vbroadcastf128 ymm5, [ecx + 16]  // C1
6070    vbroadcastf128 ymm6, [ecx + 32]  // C2
6071    vbroadcastf128 ymm7, [ecx + 48]  // C3
6072    mov        ecx, [esp + 16]  /* width */
6073
6074    // 2 pixel loop.
6075 convertloop:
6076    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
6077    lea         eax, [eax + 8]
6078    vcvtdq2ps   ymm0, ymm0        // X 8 floats
6079    vmulps      ymm2, ymm0, ymm0  // X * X
6080    vmulps      ymm3, ymm0, ymm7  // C3 * X
6081    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
6082    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
6083    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
6084    vcvttps2dq  ymm0, ymm0
6085    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
6086    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
6087    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
6088    vmovq       qword ptr [edx], xmm0
6089    lea         edx, [edx + 8]
6090    sub         ecx, 2
6091    jg          convertloop
6092    vzeroupper
6093    ret
6094  }
6095}
6096#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
6097
6098#ifdef HAS_ARGBCOLORTABLEROW_X86
6099// Tranform ARGB pixels with color table.
6100__declspec(naked)
6101void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
6102                           int width) {
6103  __asm {
6104    push       esi
6105    mov        eax, [esp + 4 + 4]   /* dst_argb */
6106    mov        esi, [esp + 4 + 8]   /* table_argb */
6107    mov        ecx, [esp + 4 + 12]  /* width */
6108
6109    // 1 pixel loop.
6110  convertloop:
6111    movzx      edx, byte ptr [eax]
6112    lea        eax, [eax + 4]
6113    movzx      edx, byte ptr [esi + edx * 4]
6114    mov        byte ptr [eax - 4], dl
6115    movzx      edx, byte ptr [eax - 4 + 1]
6116    movzx      edx, byte ptr [esi + edx * 4 + 1]
6117    mov        byte ptr [eax - 4 + 1], dl
6118    movzx      edx, byte ptr [eax - 4 + 2]
6119    movzx      edx, byte ptr [esi + edx * 4 + 2]
6120    mov        byte ptr [eax - 4 + 2], dl
6121    movzx      edx, byte ptr [eax - 4 + 3]
6122    movzx      edx, byte ptr [esi + edx * 4 + 3]
6123    mov        byte ptr [eax - 4 + 3], dl
6124    dec        ecx
6125    jg         convertloop
6126    pop        esi
6127    ret
6128  }
6129}
6130#endif  // HAS_ARGBCOLORTABLEROW_X86
6131
6132#ifdef HAS_RGBCOLORTABLEROW_X86
6133// Tranform RGB pixels with color table.
6134__declspec(naked)
6135void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
6136  __asm {
6137    push       esi
6138    mov        eax, [esp + 4 + 4]   /* dst_argb */
6139    mov        esi, [esp + 4 + 8]   /* table_argb */
6140    mov        ecx, [esp + 4 + 12]  /* width */
6141
6142    // 1 pixel loop.
6143  convertloop:
6144    movzx      edx, byte ptr [eax]
6145    lea        eax, [eax + 4]
6146    movzx      edx, byte ptr [esi + edx * 4]
6147    mov        byte ptr [eax - 4], dl
6148    movzx      edx, byte ptr [eax - 4 + 1]
6149    movzx      edx, byte ptr [esi + edx * 4 + 1]
6150    mov        byte ptr [eax - 4 + 1], dl
6151    movzx      edx, byte ptr [eax - 4 + 2]
6152    movzx      edx, byte ptr [esi + edx * 4 + 2]
6153    mov        byte ptr [eax - 4 + 2], dl
6154    dec        ecx
6155    jg         convertloop
6156
6157    pop        esi
6158    ret
6159  }
6160}
6161#endif  // HAS_RGBCOLORTABLEROW_X86
6162
6163#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6164// Tranform RGB pixels with luma table.
6165__declspec(naked)
6166void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6167                                 int width,
6168                                 const uint8* luma, uint32 lumacoeff) {
6169  __asm {
6170    push       esi
6171    push       edi
6172    mov        eax, [esp + 8 + 4]   /* src_argb */
6173    mov        edi, [esp + 8 + 8]   /* dst_argb */
6174    mov        ecx, [esp + 8 + 12]  /* width */
6175    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
6176    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
6177    pshufd     xmm2, xmm2, 0
6178    pshufd     xmm3, xmm3, 0
6179    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
6180    psllw      xmm4, 8
6181    pxor       xmm5, xmm5
6182
6183    // 4 pixel loop.
6184  convertloop:
6185    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
6186    pmaddubsw  xmm0, xmm3
6187    phaddw     xmm0, xmm0
6188    pand       xmm0, xmm4  // mask out low bits
6189    punpcklwd  xmm0, xmm5
6190    paddd      xmm0, xmm2  // add table base
6191    movd       esi, xmm0
6192    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6193
6194    movzx      edx, byte ptr [eax]
6195    movzx      edx, byte ptr [esi + edx]
6196    mov        byte ptr [edi], dl
6197    movzx      edx, byte ptr [eax + 1]
6198    movzx      edx, byte ptr [esi + edx]
6199    mov        byte ptr [edi + 1], dl
6200    movzx      edx, byte ptr [eax + 2]
6201    movzx      edx, byte ptr [esi + edx]
6202    mov        byte ptr [edi + 2], dl
6203    movzx      edx, byte ptr [eax + 3]  // copy alpha.
6204    mov        byte ptr [edi + 3], dl
6205
6206    movd       esi, xmm0
6207    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6208
6209    movzx      edx, byte ptr [eax + 4]
6210    movzx      edx, byte ptr [esi + edx]
6211    mov        byte ptr [edi + 4], dl
6212    movzx      edx, byte ptr [eax + 5]
6213    movzx      edx, byte ptr [esi + edx]
6214    mov        byte ptr [edi + 5], dl
6215    movzx      edx, byte ptr [eax + 6]
6216    movzx      edx, byte ptr [esi + edx]
6217    mov        byte ptr [edi + 6], dl
6218    movzx      edx, byte ptr [eax + 7]  // copy alpha.
6219    mov        byte ptr [edi + 7], dl
6220
6221    movd       esi, xmm0
6222    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6223
6224    movzx      edx, byte ptr [eax + 8]
6225    movzx      edx, byte ptr [esi + edx]
6226    mov        byte ptr [edi + 8], dl
6227    movzx      edx, byte ptr [eax + 9]
6228    movzx      edx, byte ptr [esi + edx]
6229    mov        byte ptr [edi + 9], dl
6230    movzx      edx, byte ptr [eax + 10]
6231    movzx      edx, byte ptr [esi + edx]
6232    mov        byte ptr [edi + 10], dl
6233    movzx      edx, byte ptr [eax + 11]  // copy alpha.
6234    mov        byte ptr [edi + 11], dl
6235
6236    movd       esi, xmm0
6237
6238    movzx      edx, byte ptr [eax + 12]
6239    movzx      edx, byte ptr [esi + edx]
6240    mov        byte ptr [edi + 12], dl
6241    movzx      edx, byte ptr [eax + 13]
6242    movzx      edx, byte ptr [esi + edx]
6243    mov        byte ptr [edi + 13], dl
6244    movzx      edx, byte ptr [eax + 14]
6245    movzx      edx, byte ptr [esi + edx]
6246    mov        byte ptr [edi + 14], dl
6247    movzx      edx, byte ptr [eax + 15]  // copy alpha.
6248    mov        byte ptr [edi + 15], dl
6249
6250    lea        eax, [eax + 16]
6251    lea        edi, [edi + 16]
6252    sub        ecx, 4
6253    jg         convertloop
6254
6255    pop        edi
6256    pop        esi
6257    ret
6258  }
6259}
6260#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6261
6262#endif  // defined(_M_X64)
6263
6264#ifdef __cplusplus
6265}  // extern "C"
6266}  // namespace libyuv
6267#endif
6268
6269#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6270