1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
14    defined(_MSC_VER) && !defined(__clang__)
15#include <emmintrin.h>
16#include <tmmintrin.h>  // For _mm_maddubs_epi16
17#endif
18
19#ifdef __cplusplus
20namespace libyuv {
21extern "C" {
22#endif
23
24// This module is for Visual C.
25#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
26    defined(_MSC_VER) && !defined(__clang__)
27
28struct YuvConstants {
29  lvec8 kUVToB;     // 0
30  lvec8 kUVToG;     // 32
31  lvec8 kUVToR;     // 64
32  lvec16 kUVBiasB;  // 96
33  lvec16 kUVBiasG;  // 128
34  lvec16 kUVBiasR;  // 160
35  lvec16 kYToRgb;   // 192
36};
37
38// BT.601 YUV to RGB reference
39//  R = (Y - 16) * 1.164              - V * -1.596
40//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
41//  B = (Y - 16) * 1.164 - U * -2.018
42
43// Y contribution to R,G,B.  Scale and bias.
44// TODO(fbarchard): Consider moving constants into a common header.
45#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
46#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
47
48// U and V contributions to R,G,B.
49#define UB -128 /* max(-128, round(-2.018 * 64)) */
50#define UG 25 /* round(0.391 * 64) */
51#define VG 52 /* round(0.813 * 64) */
52#define VR -102 /* round(-1.596 * 64) */
53
54// Bias values to subtract 16 from Y and 128 from U and V.
55#define BB (UB * 128            + YGB)
56#define BG (UG * 128 + VG * 128 + YGB)
57#define BR            (VR * 128 + YGB)
58
59// BT601 constants for YUV to RGB.
60static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
61  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
62    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
63  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
64    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
65  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
66    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
67  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
68  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
69  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
70  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
71};
72
73// BT601 constants for NV21 where chroma plane is VU instead of UV.
74static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
75  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
76    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
77  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
78    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
79  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
80    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
81  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
82  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
83  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
84  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
85};
86
87#undef YG
88#undef YGB
89#undef UB
90#undef UG
91#undef VG
92#undef VR
93#undef BB
94#undef BG
95#undef BR
96
97// JPEG YUV to RGB reference
98// *  R = Y                - V * -1.40200
99// *  G = Y - U *  0.34414 - V *  0.71414
100// *  B = Y - U * -1.77200
101
102// Y contribution to R,G,B.  Scale and bias.
103// TODO(fbarchard): Consider moving constants into a common header.
104#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
105#define YGBJ 32  /* 64 / 2 */
106
107// U and V contributions to R,G,B.
108#define UBJ -113 /* round(-1.77200 * 64) */
109#define UGJ 22 /* round(0.34414 * 64) */
110#define VGJ 46 /* round(0.71414  * 64) */
111#define VRJ -90 /* round(-1.40200 * 64) */
112
113// Bias values to subtract 16 from Y and 128 from U and V.
114#define BBJ (UBJ * 128             + YGBJ)
115#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
116#define BRJ             (VRJ * 128 + YGBJ)
117
118// JPEG constants for YUV to RGB.
119static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
120  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
121    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
122  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
123    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
124    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
125    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
126  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
127    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
128  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
129    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
130  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
131    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
132  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
133    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
134  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
135    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
136};
137
138#undef YGJ
139#undef YGBJ
140#undef UBJ
141#undef UGJ
142#undef VGJ
143#undef VRJ
144#undef BBJ
145#undef BGJ
146#undef BRJ
147
148// 64 bit
149#if defined(_M_X64)
150#if defined(HAS_I422TOARGBROW_SSSE3)
151void I422ToARGBRow_SSSE3(const uint8* y_buf,
152                         const uint8* u_buf,
153                         const uint8* v_buf,
154                         uint8* dst_argb,
155                         int width) {
156  __m128i xmm0, xmm1, xmm2, xmm3;
157  const __m128i xmm5 = _mm_set1_epi8(-1);
158  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
159
160  while (width > 0) {
161    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
162    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
163    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
164    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
165    xmm1 = _mm_loadu_si128(&xmm0);
166    xmm2 = _mm_loadu_si128(&xmm0);
167    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
168    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
169    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
170    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
171    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
172    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
173    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
174    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
175    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
176    xmm0 = _mm_adds_epi16(xmm0, xmm3);
177    xmm1 = _mm_adds_epi16(xmm1, xmm3);
178    xmm2 = _mm_adds_epi16(xmm2, xmm3);
179    xmm0 = _mm_srai_epi16(xmm0, 6);
180    xmm1 = _mm_srai_epi16(xmm1, 6);
181    xmm2 = _mm_srai_epi16(xmm2, 6);
182    xmm0 = _mm_packus_epi16(xmm0, xmm0);
183    xmm1 = _mm_packus_epi16(xmm1, xmm1);
184    xmm2 = _mm_packus_epi16(xmm2, xmm2);
185    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
186    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
187    xmm1 = _mm_loadu_si128(&xmm0);
188    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
189    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
190
191    _mm_storeu_si128((__m128i *)dst_argb, xmm0);
192    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
193
194    y_buf += 8;
195    u_buf += 4;
196    dst_argb += 32;
197    width -= 8;
198  }
199}
200#endif
201// 32 bit
202#else  // defined(_M_X64)
203#ifdef HAS_ARGBTOYROW_SSSE3
204
205// Constants for ARGB.
206static const vec8 kARGBToY = {
207  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
208};
209
210// JPeg full range.
211static const vec8 kARGBToYJ = {
212  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
213};
214
215static const vec8 kARGBToU = {
216  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
217};
218
219static const vec8 kARGBToUJ = {
220  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
221};
222
223static const vec8 kARGBToV = {
224  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
225};
226
227static const vec8 kARGBToVJ = {
228  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
229};
230
231// vpshufb for vphaddw + vpackuswb packed to shorts.
232static const lvec8 kShufARGBToUV_AVX = {
233  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
234  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
235};
236
237// Constants for BGRA.
238static const vec8 kBGRAToY = {
239  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
240};
241
242static const vec8 kBGRAToU = {
243  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
244};
245
246static const vec8 kBGRAToV = {
247  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
248};
249
250// Constants for ABGR.
251static const vec8 kABGRToY = {
252  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
253};
254
255static const vec8 kABGRToU = {
256  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
257};
258
259static const vec8 kABGRToV = {
260  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
261};
262
263// Constants for RGBA.
264static const vec8 kRGBAToY = {
265  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
266};
267
268static const vec8 kRGBAToU = {
269  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
270};
271
272static const vec8 kRGBAToV = {
273  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
274};
275
276static const uvec8 kAddY16 = {
277  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
278};
279
280// 7 bit fixed point 0.5.
281static const vec16 kAddYJ64 = {
282  64, 64, 64, 64, 64, 64, 64, 64
283};
284
285static const uvec8 kAddUV128 = {
286  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
287  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
288};
289
290static const uvec16 kAddUVJ128 = {
291  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
292};
293
294// Shuffle table for converting RGB24 to ARGB.
295static const uvec8 kShuffleMaskRGB24ToARGB = {
296  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
297};
298
299// Shuffle table for converting RAW to ARGB.
300static const uvec8 kShuffleMaskRAWToARGB = {
301  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
302};
303
304// Shuffle table for converting ARGB to RGB24.
305static const uvec8 kShuffleMaskARGBToRGB24 = {
306  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
307};
308
309// Shuffle table for converting ARGB to RAW.
310static const uvec8 kShuffleMaskARGBToRAW = {
311  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
312};
313
314// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
315static const uvec8 kShuffleMaskARGBToRGB24_0 = {
316  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
317};
318
319// Shuffle table for converting ARGB to RAW.
320static const uvec8 kShuffleMaskARGBToRAW_0 = {
321  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
322};
323
324// Duplicates gray value 3 times and fills in alpha opaque.
325__declspec(naked)
326void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
327  __asm {
328    mov        eax, [esp + 4]        // src_y
329    mov        edx, [esp + 8]        // dst_argb
330    mov        ecx, [esp + 12]       // pix
331    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
332    pslld      xmm5, 24
333
334  convertloop:
335    movq       xmm0, qword ptr [eax]
336    lea        eax,  [eax + 8]
337    punpcklbw  xmm0, xmm0
338    movdqa     xmm1, xmm0
339    punpcklwd  xmm0, xmm0
340    punpckhwd  xmm1, xmm1
341    por        xmm0, xmm5
342    por        xmm1, xmm5
343    movdqu     [edx], xmm0
344    movdqu     [edx + 16], xmm1
345    lea        edx, [edx + 32]
346    sub        ecx, 8
347    jg         convertloop
348    ret
349  }
350}
351
352#ifdef HAS_J400TOARGBROW_AVX2
353// Duplicates gray value 3 times and fills in alpha opaque.
354__declspec(naked)
355void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
356  __asm {
357    mov         eax, [esp + 4]        // src_y
358    mov         edx, [esp + 8]        // dst_argb
359    mov         ecx, [esp + 12]       // pix
360    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
361    vpslld      ymm5, ymm5, 24
362
363  convertloop:
364    vmovdqu     xmm0, [eax]
365    lea         eax,  [eax + 16]
366    vpermq      ymm0, ymm0, 0xd8
367    vpunpcklbw  ymm0, ymm0, ymm0
368    vpermq      ymm0, ymm0, 0xd8
369    vpunpckhwd  ymm1, ymm0, ymm0
370    vpunpcklwd  ymm0, ymm0, ymm0
371    vpor        ymm0, ymm0, ymm5
372    vpor        ymm1, ymm1, ymm5
373    vmovdqu     [edx], ymm0
374    vmovdqu     [edx + 32], ymm1
375    lea         edx, [edx + 64]
376    sub         ecx, 16
377    jg          convertloop
378    vzeroupper
379    ret
380  }
381}
382#endif  // HAS_J400TOARGBROW_AVX2
383
384__declspec(naked)
385void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
386  __asm {
387    mov       eax, [esp + 4]   // src_rgb24
388    mov       edx, [esp + 8]   // dst_argb
389    mov       ecx, [esp + 12]  // pix
390    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
391    pslld     xmm5, 24
392    movdqa    xmm4, kShuffleMaskRGB24ToARGB
393
394 convertloop:
395    movdqu    xmm0, [eax]
396    movdqu    xmm1, [eax + 16]
397    movdqu    xmm3, [eax + 32]
398    lea       eax, [eax + 48]
399    movdqa    xmm2, xmm3
400    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
401    pshufb    xmm2, xmm4
402    por       xmm2, xmm5
403    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
404    pshufb    xmm0, xmm4
405    movdqu    [edx + 32], xmm2
406    por       xmm0, xmm5
407    pshufb    xmm1, xmm4
408    movdqu    [edx], xmm0
409    por       xmm1, xmm5
410    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
411    pshufb    xmm3, xmm4
412    movdqu    [edx + 16], xmm1
413    por       xmm3, xmm5
414    movdqu    [edx + 48], xmm3
415    lea       edx, [edx + 64]
416    sub       ecx, 16
417    jg        convertloop
418    ret
419  }
420}
421
422__declspec(naked)
423void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
424                        int pix) {
425  __asm {
426    mov       eax, [esp + 4]   // src_raw
427    mov       edx, [esp + 8]   // dst_argb
428    mov       ecx, [esp + 12]  // pix
429    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
430    pslld     xmm5, 24
431    movdqa    xmm4, kShuffleMaskRAWToARGB
432
433 convertloop:
434    movdqu    xmm0, [eax]
435    movdqu    xmm1, [eax + 16]
436    movdqu    xmm3, [eax + 32]
437    lea       eax, [eax + 48]
438    movdqa    xmm2, xmm3
439    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
440    pshufb    xmm2, xmm4
441    por       xmm2, xmm5
442    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
443    pshufb    xmm0, xmm4
444    movdqu    [edx + 32], xmm2
445    por       xmm0, xmm5
446    pshufb    xmm1, xmm4
447    movdqu    [edx], xmm0
448    por       xmm1, xmm5
449    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
450    pshufb    xmm3, xmm4
451    movdqu    [edx + 16], xmm1
452    por       xmm3, xmm5
453    movdqu    [edx + 48], xmm3
454    lea       edx, [edx + 64]
455    sub       ecx, 16
456    jg        convertloop
457    ret
458  }
459}
460
461// pmul method to replicate bits.
462// Math to replicate bits:
463// (v << 8) | (v << 3)
464// v * 256 + v * 8
465// v * (256 + 8)
466// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
467// 20 instructions.
468__declspec(naked)
469void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
470                          int pix) {
471  __asm {
472    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
473    movd      xmm5, eax
474    pshufd    xmm5, xmm5, 0
475    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
476    movd      xmm6, eax
477    pshufd    xmm6, xmm6, 0
478    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
479    psllw     xmm3, 11
480    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
481    psllw     xmm4, 10
482    psrlw     xmm4, 5
483    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
484    psllw     xmm7, 8
485
486    mov       eax, [esp + 4]   // src_rgb565
487    mov       edx, [esp + 8]   // dst_argb
488    mov       ecx, [esp + 12]  // pix
489    sub       edx, eax
490    sub       edx, eax
491
492 convertloop:
493    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
494    movdqa    xmm1, xmm0
495    movdqa    xmm2, xmm0
496    pand      xmm1, xmm3    // R in upper 5 bits
497    psllw     xmm2, 11      // B in upper 5 bits
498    pmulhuw   xmm1, xmm5    // * (256 + 8)
499    pmulhuw   xmm2, xmm5    // * (256 + 8)
500    psllw     xmm1, 8
501    por       xmm1, xmm2    // RB
502    pand      xmm0, xmm4    // G in middle 6 bits
503    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
504    por       xmm0, xmm7    // AG
505    movdqa    xmm2, xmm1
506    punpcklbw xmm1, xmm0
507    punpckhbw xmm2, xmm0
508    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
509    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
510    lea       eax, [eax + 16]
511    sub       ecx, 8
512    jg        convertloop
513    ret
514  }
515}
516
517#ifdef HAS_RGB565TOARGBROW_AVX2
518// pmul method to replicate bits.
519// Math to replicate bits:
520// (v << 8) | (v << 3)
521// v * 256 + v * 8
522// v * (256 + 8)
523// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
524__declspec(naked)
525void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
526                          int pix) {
527  __asm {
528    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
529    vmovd      xmm5, eax
530    vbroadcastss ymm5, xmm5
531    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
532    movd       xmm6, eax
533    vbroadcastss ymm6, xmm6
534    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
535    vpsllw     ymm3, ymm3, 11
536    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
537    vpsllw     ymm4, ymm4, 10
538    vpsrlw     ymm4, ymm4, 5
539    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
540    vpsllw     ymm7, ymm7, 8
541
542    mov        eax, [esp + 4]   // src_rgb565
543    mov        edx, [esp + 8]   // dst_argb
544    mov        ecx, [esp + 12]  // pix
545    sub        edx, eax
546    sub        edx, eax
547
548 convertloop:
549    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
550    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
551    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
552    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
553    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
554    vpsllw     ymm1, ymm1, 8
555    vpor       ymm1, ymm1, ymm2    // RB
556    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
557    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
558    vpor       ymm0, ymm0, ymm7    // AG
559    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
560    vpermq     ymm1, ymm1, 0xd8
561    vpunpckhbw ymm2, ymm1, ymm0
562    vpunpcklbw ymm1, ymm1, ymm0
563    vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB
564    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB
565    lea       eax, [eax + 32]
566    sub       ecx, 16
567    jg        convertloop
568    vzeroupper
569    ret
570  }
571}
572#endif  // HAS_RGB565TOARGBROW_AVX2
573
574#ifdef HAS_ARGB1555TOARGBROW_AVX2
575__declspec(naked)
576void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
577                            int pix) {
578  __asm {
579    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
580    vmovd      xmm5, eax
581    vbroadcastss ymm5, xmm5
582    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
583    movd       xmm6, eax
584    vbroadcastss ymm6, xmm6
585    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
586    vpsllw     ymm3, ymm3, 11
587    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
588    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
589    vpsllw     ymm7, ymm7, 8
590
591    mov        eax,  [esp + 4]   // src_argb1555
592    mov        edx,  [esp + 8]   // dst_argb
593    mov        ecx,  [esp + 12]  // pix
594    sub        edx,  eax
595    sub        edx,  eax
596
597 convertloop:
598    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
599    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
600    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
601    vpand      ymm1, ymm1, ymm3
602    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
603    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
604    vpsllw     ymm1, ymm1, 8
605    vpor       ymm1, ymm1, ymm2    // RB
606    vpsraw     ymm2, ymm0, 8       // A
607    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
608    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
609    vpand      ymm2, ymm2, ymm7
610    vpor       ymm0, ymm0, ymm2    // AG
611    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
612    vpermq     ymm1, ymm1, 0xd8
613    vpunpckhbw ymm2, ymm1, ymm0
614    vpunpcklbw ymm1, ymm1, ymm0
615    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
616    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
617    lea       eax, [eax + 32]
618    sub       ecx, 16
619    jg        convertloop
620    vzeroupper
621    ret
622  }
623}
624#endif  // HAS_ARGB1555TOARGBROW_AVX2
625
626#ifdef HAS_ARGB4444TOARGBROW_AVX2
627__declspec(naked)
628void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
629                            int pix) {
630  __asm {
631    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
632    vmovd     xmm4, eax
633    vbroadcastss ymm4, xmm4
634    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
635    mov       eax,  [esp + 4]   // src_argb4444
636    mov       edx,  [esp + 8]   // dst_argb
637    mov       ecx,  [esp + 12]  // pix
638    sub       edx,  eax
639    sub       edx,  eax
640
641 convertloop:
642    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
643    vpand      ymm2, ymm0, ymm5    // mask high nibbles
644    vpand      ymm0, ymm0, ymm4    // mask low nibbles
645    vpsrlw     ymm3, ymm2, 4
646    vpsllw     ymm1, ymm0, 4
647    vpor       ymm2, ymm2, ymm3
648    vpor       ymm0, ymm0, ymm1
649    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
650    vpermq     ymm2, ymm2, 0xd8
651    vpunpckhbw ymm1, ymm0, ymm2
652    vpunpcklbw ymm0, ymm0, ymm2
653    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
654    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
655    lea       eax, [eax + 32]
656    sub       ecx, 16
657    jg        convertloop
658    vzeroupper
659    ret
660  }
661}
662#endif  // HAS_ARGB4444TOARGBROW_AVX2
663
664// 24 instructions
665__declspec(naked)
666void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
667                            int pix) {
668  __asm {
669    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
670    movd      xmm5, eax
671    pshufd    xmm5, xmm5, 0
672    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
673    movd      xmm6, eax
674    pshufd    xmm6, xmm6, 0
675    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
676    psllw     xmm3, 11
677    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
678    psrlw     xmm4, 6
679    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
680    psllw     xmm7, 8
681
682    mov       eax, [esp + 4]   // src_argb1555
683    mov       edx, [esp + 8]   // dst_argb
684    mov       ecx, [esp + 12]  // pix
685    sub       edx, eax
686    sub       edx, eax
687
688 convertloop:
689    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
690    movdqa    xmm1, xmm0
691    movdqa    xmm2, xmm0
692    psllw     xmm1, 1       // R in upper 5 bits
693    psllw     xmm2, 11      // B in upper 5 bits
694    pand      xmm1, xmm3
695    pmulhuw   xmm2, xmm5    // * (256 + 8)
696    pmulhuw   xmm1, xmm5    // * (256 + 8)
697    psllw     xmm1, 8
698    por       xmm1, xmm2    // RB
699    movdqa    xmm2, xmm0
700    pand      xmm0, xmm4    // G in middle 5 bits
701    psraw     xmm2, 8       // A
702    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
703    pand      xmm2, xmm7
704    por       xmm0, xmm2    // AG
705    movdqa    xmm2, xmm1
706    punpcklbw xmm1, xmm0
707    punpckhbw xmm2, xmm0
708    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
709    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
710    lea       eax, [eax + 16]
711    sub       ecx, 8
712    jg        convertloop
713    ret
714  }
715}
716
717// 18 instructions.
718__declspec(naked)
719void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
720                            int pix) {
721  __asm {
722    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
723    movd      xmm4, eax
724    pshufd    xmm4, xmm4, 0
725    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
726    pslld     xmm5, 4
727    mov       eax, [esp + 4]   // src_argb4444
728    mov       edx, [esp + 8]   // dst_argb
729    mov       ecx, [esp + 12]  // pix
730    sub       edx, eax
731    sub       edx, eax
732
733 convertloop:
734    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
735    movdqa    xmm2, xmm0
736    pand      xmm0, xmm4    // mask low nibbles
737    pand      xmm2, xmm5    // mask high nibbles
738    movdqa    xmm1, xmm0
739    movdqa    xmm3, xmm2
740    psllw     xmm1, 4
741    psrlw     xmm3, 4
742    por       xmm0, xmm1
743    por       xmm2, xmm3
744    movdqa    xmm1, xmm0
745    punpcklbw xmm0, xmm2
746    punpckhbw xmm1, xmm2
747    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
748    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
749    lea       eax, [eax + 16]
750    sub       ecx, 8
751    jg        convertloop
752    ret
753  }
754}
755
756__declspec(naked)
757void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
758  __asm {
759    mov       eax, [esp + 4]   // src_argb
760    mov       edx, [esp + 8]   // dst_rgb
761    mov       ecx, [esp + 12]  // pix
762    movdqa    xmm6, kShuffleMaskARGBToRGB24
763
764 convertloop:
765    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
766    movdqu    xmm1, [eax + 16]
767    movdqu    xmm2, [eax + 32]
768    movdqu    xmm3, [eax + 48]
769    lea       eax, [eax + 64]
770    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
771    pshufb    xmm1, xmm6
772    pshufb    xmm2, xmm6
773    pshufb    xmm3, xmm6
774    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
775    psrldq    xmm1, 4      // 8 bytes from 1
776    pslldq    xmm4, 12     // 4 bytes from 1 for 0
777    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
778    por       xmm0, xmm4   // 4 bytes from 1 for 0
779    pslldq    xmm5, 8      // 8 bytes from 2 for 1
780    movdqu    [edx], xmm0  // store 0
781    por       xmm1, xmm5   // 8 bytes from 2 for 1
782    psrldq    xmm2, 8      // 4 bytes from 2
783    pslldq    xmm3, 4      // 12 bytes from 3 for 2
784    por       xmm2, xmm3   // 12 bytes from 3 for 2
785    movdqu    [edx + 16], xmm1   // store 1
786    movdqu    [edx + 32], xmm2   // store 2
787    lea       edx, [edx + 48]
788    sub       ecx, 16
789    jg        convertloop
790    ret
791  }
792}
793
794__declspec(naked)
795void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
796  __asm {
797    mov       eax, [esp + 4]   // src_argb
798    mov       edx, [esp + 8]   // dst_rgb
799    mov       ecx, [esp + 12]  // pix
800    movdqa    xmm6, kShuffleMaskARGBToRAW
801
802 convertloop:
803    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
804    movdqu    xmm1, [eax + 16]
805    movdqu    xmm2, [eax + 32]
806    movdqu    xmm3, [eax + 48]
807    lea       eax, [eax + 64]
808    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
809    pshufb    xmm1, xmm6
810    pshufb    xmm2, xmm6
811    pshufb    xmm3, xmm6
812    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
813    psrldq    xmm1, 4      // 8 bytes from 1
814    pslldq    xmm4, 12     // 4 bytes from 1 for 0
815    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
816    por       xmm0, xmm4   // 4 bytes from 1 for 0
817    pslldq    xmm5, 8      // 8 bytes from 2 for 1
818    movdqu    [edx], xmm0  // store 0
819    por       xmm1, xmm5   // 8 bytes from 2 for 1
820    psrldq    xmm2, 8      // 4 bytes from 2
821    pslldq    xmm3, 4      // 12 bytes from 3 for 2
822    por       xmm2, xmm3   // 12 bytes from 3 for 2
823    movdqu    [edx + 16], xmm1   // store 1
824    movdqu    [edx + 32], xmm2   // store 2
825    lea       edx, [edx + 48]
826    sub       ecx, 16
827    jg        convertloop
828    ret
829  }
830}
831
832// 4 pixels
833__declspec(naked)
834void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
835  __asm {
836    mov       eax, [esp + 4]   // src_argb
837    mov       edx, [esp + 8]   // dst_rgb
838    mov       ecx, [esp + 12]  // pix
839    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
840    psrld     xmm3, 27
841    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
842    psrld     xmm4, 26
843    pslld     xmm4, 5
844    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
845    pslld     xmm5, 11
846
847 convertloop:
848    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
849    movdqa    xmm1, xmm0    // B
850    movdqa    xmm2, xmm0    // G
851    pslld     xmm0, 8       // R
852    psrld     xmm1, 3       // B
853    psrld     xmm2, 5       // G
854    psrad     xmm0, 16      // R
855    pand      xmm1, xmm3    // B
856    pand      xmm2, xmm4    // G
857    pand      xmm0, xmm5    // R
858    por       xmm1, xmm2    // BG
859    por       xmm0, xmm1    // BGR
860    packssdw  xmm0, xmm0
861    lea       eax, [eax + 16]
862    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
863    lea       edx, [edx + 8]
864    sub       ecx, 4
865    jg        convertloop
866    ret
867  }
868}
869
870// 8 pixels
871__declspec(naked)
872void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
873                                const uint32 dither4, int pix) {
874  __asm {
875
876    mov       eax, [esp + 4]   // src_argb
877    mov       edx, [esp + 8]   // dst_rgb
878    movd      xmm6, [esp + 12] // dither4
879    mov       ecx, [esp + 16]  // pix
880    punpcklbw xmm6, xmm6       // make dither 16 bytes
881    movdqa    xmm7, xmm6
882    punpcklwd xmm6, xmm6
883    punpckhwd xmm7, xmm7
884    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
885    psrld     xmm3, 27
886    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
887    psrld     xmm4, 26
888    pslld     xmm4, 5
889    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
890    pslld     xmm5, 11
891
892 convertloop:
893    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
894    paddusb   xmm0, xmm6    // add dither
895    movdqa    xmm1, xmm0    // B
896    movdqa    xmm2, xmm0    // G
897    pslld     xmm0, 8       // R
898    psrld     xmm1, 3       // B
899    psrld     xmm2, 5       // G
900    psrad     xmm0, 16      // R
901    pand      xmm1, xmm3    // B
902    pand      xmm2, xmm4    // G
903    pand      xmm0, xmm5    // R
904    por       xmm1, xmm2    // BG
905    por       xmm0, xmm1    // BGR
906    packssdw  xmm0, xmm0
907    lea       eax, [eax + 16]
908    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
909    lea       edx, [edx + 8]
910    sub       ecx, 4
911    jg        convertloop
912    ret
913  }
914}
915
916#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
917__declspec(naked)
918void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
919                                const uint32 dither4, int pix) {
920  __asm {
921    mov        eax, [esp + 4]      // src_argb
922    mov        edx, [esp + 8]      // dst_rgb
923    vbroadcastss xmm6, [esp + 12]  // dither4
924    mov        ecx, [esp + 16]     // pix
925    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
926    vpermq     ymm6, ymm6, 0xd8
927    vpunpcklwd ymm6, ymm6, ymm6
928    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
929    vpsrld     ymm3, ymm3, 27
930    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
931    vpsrld     ymm4, ymm4, 26
932    vpslld     ymm4, ymm4, 5
933    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
934
935 convertloop:
936    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
937    vpaddusb   ymm0, ymm0, ymm6    // add dither
938    vpsrld     ymm2, ymm0, 5       // G
939    vpsrld     ymm1, ymm0, 3       // B
940    vpsrld     ymm0, ymm0, 8       // R
941    vpand      ymm2, ymm2, ymm4    // G
942    vpand      ymm1, ymm1, ymm3    // B
943    vpand      ymm0, ymm0, ymm5    // R
944    vpor       ymm1, ymm1, ymm2    // BG
945    vpor       ymm0, ymm0, ymm1    // BGR
946    vpackusdw  ymm0, ymm0, ymm0
947    vpermq     ymm0, ymm0, 0xd8
948    lea        eax, [eax + 32]
949    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
950    lea        edx, [edx + 16]
951    sub        ecx, 8
952    jg         convertloop
953    vzeroupper
954    ret
955  }
956}
957#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
958
959// TODO(fbarchard): Improve sign extension/packing.
960__declspec(naked)
961void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
962  __asm {
963    mov       eax, [esp + 4]   // src_argb
964    mov       edx, [esp + 8]   // dst_rgb
965    mov       ecx, [esp + 12]  // pix
966    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
967    psrld     xmm4, 27
968    movdqa    xmm5, xmm4       // generate mask 0x000003e0
969    pslld     xmm5, 5
970    movdqa    xmm6, xmm4       // generate mask 0x00007c00
971    pslld     xmm6, 10
972    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
973    pslld     xmm7, 15
974
975 convertloop:
976    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
977    movdqa    xmm1, xmm0    // B
978    movdqa    xmm2, xmm0    // G
979    movdqa    xmm3, xmm0    // R
980    psrad     xmm0, 16      // A
981    psrld     xmm1, 3       // B
982    psrld     xmm2, 6       // G
983    psrld     xmm3, 9       // R
984    pand      xmm0, xmm7    // A
985    pand      xmm1, xmm4    // B
986    pand      xmm2, xmm5    // G
987    pand      xmm3, xmm6    // R
988    por       xmm0, xmm1    // BA
989    por       xmm2, xmm3    // GR
990    por       xmm0, xmm2    // BGRA
991    packssdw  xmm0, xmm0
992    lea       eax, [eax + 16]
993    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
994    lea       edx, [edx + 8]
995    sub       ecx, 4
996    jg        convertloop
997    ret
998  }
999}
1000
1001__declspec(naked)
1002void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1003  __asm {
1004    mov       eax, [esp + 4]   // src_argb
1005    mov       edx, [esp + 8]   // dst_rgb
1006    mov       ecx, [esp + 12]  // pix
1007    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
1008    psllw     xmm4, 12
1009    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
1010    psrlw     xmm3, 8
1011
1012 convertloop:
1013    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
1014    movdqa    xmm1, xmm0
1015    pand      xmm0, xmm3    // low nibble
1016    pand      xmm1, xmm4    // high nibble
1017    psrld     xmm0, 4
1018    psrld     xmm1, 8
1019    por       xmm0, xmm1
1020    packuswb  xmm0, xmm0
1021    lea       eax, [eax + 16]
1022    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
1023    lea       edx, [edx + 8]
1024    sub       ecx, 4
1025    jg        convertloop
1026    ret
1027  }
1028}
1029
1030#ifdef HAS_ARGBTORGB565ROW_AVX2
1031__declspec(naked)
1032void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1033  __asm {
1034    mov        eax, [esp + 4]      // src_argb
1035    mov        edx, [esp + 8]      // dst_rgb
1036    mov        ecx, [esp + 12]     // pix
1037    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
1038    vpsrld     ymm3, ymm3, 27
1039    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
1040    vpsrld     ymm4, ymm4, 26
1041    vpslld     ymm4, ymm4, 5
1042    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
1043
1044 convertloop:
1045    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1046    vpsrld     ymm2, ymm0, 5       // G
1047    vpsrld     ymm1, ymm0, 3       // B
1048    vpsrld     ymm0, ymm0, 8       // R
1049    vpand      ymm2, ymm2, ymm4    // G
1050    vpand      ymm1, ymm1, ymm3    // B
1051    vpand      ymm0, ymm0, ymm5    // R
1052    vpor       ymm1, ymm1, ymm2    // BG
1053    vpor       ymm0, ymm0, ymm1    // BGR
1054    vpackusdw  ymm0, ymm0, ymm0
1055    vpermq     ymm0, ymm0, 0xd8
1056    lea        eax, [eax + 32]
1057    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
1058    lea        edx, [edx + 16]
1059    sub        ecx, 8
1060    jg         convertloop
1061    vzeroupper
1062    ret
1063  }
1064}
1065#endif  // HAS_ARGBTORGB565ROW_AVX2
1066
1067#ifdef HAS_ARGBTOARGB1555ROW_AVX2
1068__declspec(naked)
1069void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1070  __asm {
1071    mov        eax, [esp + 4]      // src_argb
1072    mov        edx, [esp + 8]      // dst_rgb
1073    mov        ecx, [esp + 12]     // pix
1074    vpcmpeqb   ymm4, ymm4, ymm4
1075    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
1076    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
1077    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
1078    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
1079    vpslld     ymm7, ymm7, 15
1080
1081 convertloop:
1082    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1083    vpsrld     ymm3, ymm0, 9       // R
1084    vpsrld     ymm2, ymm0, 6       // G
1085    vpsrld     ymm1, ymm0, 3       // B
1086    vpsrad     ymm0, ymm0, 16      // A
1087    vpand      ymm3, ymm3, ymm6    // R
1088    vpand      ymm2, ymm2, ymm5    // G
1089    vpand      ymm1, ymm1, ymm4    // B
1090    vpand      ymm0, ymm0, ymm7    // A
1091    vpor       ymm0, ymm0, ymm1    // BA
1092    vpor       ymm2, ymm2, ymm3    // GR
1093    vpor       ymm0, ymm0, ymm2    // BGRA
1094    vpackssdw  ymm0, ymm0, ymm0
1095    vpermq     ymm0, ymm0, 0xd8
1096    lea        eax, [eax + 32]
1097    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
1098    lea        edx, [edx + 16]
1099    sub        ecx, 8
1100    jg         convertloop
1101    vzeroupper
1102    ret
1103  }
1104}
1105#endif  // HAS_ARGBTOARGB1555ROW_AVX2
1106
1107#ifdef HAS_ARGBTOARGB4444ROW_AVX2
1108__declspec(naked)
1109void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1110  __asm {
1111    mov        eax, [esp + 4]   // src_argb
1112    mov        edx, [esp + 8]   // dst_rgb
1113    mov        ecx, [esp + 12]  // pix
1114    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
1115    vpsllw     ymm4, ymm4, 12
1116    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
1117
1118 convertloop:
1119    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1120    vpand      ymm1, ymm0, ymm4    // high nibble
1121    vpand      ymm0, ymm0, ymm3    // low nibble
1122    vpsrld     ymm1, ymm1, 8
1123    vpsrld     ymm0, ymm0, 4
1124    vpor       ymm0, ymm0, ymm1
1125    vpackuswb  ymm0, ymm0, ymm0
1126    vpermq     ymm0, ymm0, 0xd8
1127    lea        eax, [eax + 32]
1128    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
1129    lea        edx, [edx + 16]
1130    sub        ecx, 8
1131    jg         convertloop
1132    vzeroupper
1133    ret
1134  }
1135}
1136#endif  // HAS_ARGBTOARGB4444ROW_AVX2
1137
1138// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1139__declspec(naked)
1140void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1141  __asm {
1142    mov        eax, [esp + 4]   /* src_argb */
1143    mov        edx, [esp + 8]   /* dst_y */
1144    mov        ecx, [esp + 12]  /* pix */
1145    movdqa     xmm4, kARGBToY
1146    movdqa     xmm5, kAddY16
1147
1148 convertloop:
1149    movdqu     xmm0, [eax]
1150    movdqu     xmm1, [eax + 16]
1151    movdqu     xmm2, [eax + 32]
1152    movdqu     xmm3, [eax + 48]
1153    pmaddubsw  xmm0, xmm4
1154    pmaddubsw  xmm1, xmm4
1155    pmaddubsw  xmm2, xmm4
1156    pmaddubsw  xmm3, xmm4
1157    lea        eax, [eax + 64]
1158    phaddw     xmm0, xmm1
1159    phaddw     xmm2, xmm3
1160    psrlw      xmm0, 7
1161    psrlw      xmm2, 7
1162    packuswb   xmm0, xmm2
1163    paddb      xmm0, xmm5
1164    movdqu     [edx], xmm0
1165    lea        edx, [edx + 16]
1166    sub        ecx, 16
1167    jg         convertloop
1168    ret
1169  }
1170}
1171
1172// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1173// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1174__declspec(naked)
1175void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1176  __asm {
1177    mov        eax, [esp + 4]   /* src_argb */
1178    mov        edx, [esp + 8]   /* dst_y */
1179    mov        ecx, [esp + 12]  /* pix */
1180    movdqa     xmm4, kARGBToYJ
1181    movdqa     xmm5, kAddYJ64
1182
1183 convertloop:
1184    movdqu     xmm0, [eax]
1185    movdqu     xmm1, [eax + 16]
1186    movdqu     xmm2, [eax + 32]
1187    movdqu     xmm3, [eax + 48]
1188    pmaddubsw  xmm0, xmm4
1189    pmaddubsw  xmm1, xmm4
1190    pmaddubsw  xmm2, xmm4
1191    pmaddubsw  xmm3, xmm4
1192    lea        eax, [eax + 64]
1193    phaddw     xmm0, xmm1
1194    phaddw     xmm2, xmm3
1195    paddw      xmm0, xmm5  // Add .5 for rounding.
1196    paddw      xmm2, xmm5
1197    psrlw      xmm0, 7
1198    psrlw      xmm2, 7
1199    packuswb   xmm0, xmm2
1200    movdqu     [edx], xmm0
1201    lea        edx, [edx + 16]
1202    sub        ecx, 16
1203    jg         convertloop
1204    ret
1205  }
1206}
1207
1208#ifdef HAS_ARGBTOYROW_AVX2
1209// vpermd for vphaddw + vpackuswb vpermd.
1210static const lvec32 kPermdARGBToY_AVX = {
1211  0, 4, 1, 5, 2, 6, 3, 7
1212};
1213
1214// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1215__declspec(naked)
1216void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
1217  __asm {
1218    mov        eax, [esp + 4]   /* src_argb */
1219    mov        edx, [esp + 8]   /* dst_y */
1220    mov        ecx, [esp + 12]  /* pix */
1221    vbroadcastf128 ymm4, kARGBToY
1222    vbroadcastf128 ymm5, kAddY16
1223    vmovdqu    ymm6, kPermdARGBToY_AVX
1224
1225 convertloop:
1226    vmovdqu    ymm0, [eax]
1227    vmovdqu    ymm1, [eax + 32]
1228    vmovdqu    ymm2, [eax + 64]
1229    vmovdqu    ymm3, [eax + 96]
1230    vpmaddubsw ymm0, ymm0, ymm4
1231    vpmaddubsw ymm1, ymm1, ymm4
1232    vpmaddubsw ymm2, ymm2, ymm4
1233    vpmaddubsw ymm3, ymm3, ymm4
1234    lea        eax, [eax + 128]
1235    vphaddw    ymm0, ymm0, ymm1  // mutates.
1236    vphaddw    ymm2, ymm2, ymm3
1237    vpsrlw     ymm0, ymm0, 7
1238    vpsrlw     ymm2, ymm2, 7
1239    vpackuswb  ymm0, ymm0, ymm2  // mutates.
1240    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1241    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
1242    vmovdqu    [edx], ymm0
1243    lea        edx, [edx + 32]
1244    sub        ecx, 32
1245    jg         convertloop
1246    vzeroupper
1247    ret
1248  }
1249}
1250#endif  //  HAS_ARGBTOYROW_AVX2
1251
1252#ifdef HAS_ARGBTOYJROW_AVX2
1253// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1254__declspec(naked)
1255void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
1256  __asm {
1257    mov        eax, [esp + 4]   /* src_argb */
1258    mov        edx, [esp + 8]   /* dst_y */
1259    mov        ecx, [esp + 12]  /* pix */
1260    vbroadcastf128 ymm4, kARGBToYJ
1261    vbroadcastf128 ymm5, kAddYJ64
1262    vmovdqu    ymm6, kPermdARGBToY_AVX
1263
1264 convertloop:
1265    vmovdqu    ymm0, [eax]
1266    vmovdqu    ymm1, [eax + 32]
1267    vmovdqu    ymm2, [eax + 64]
1268    vmovdqu    ymm3, [eax + 96]
1269    vpmaddubsw ymm0, ymm0, ymm4
1270    vpmaddubsw ymm1, ymm1, ymm4
1271    vpmaddubsw ymm2, ymm2, ymm4
1272    vpmaddubsw ymm3, ymm3, ymm4
1273    lea        eax, [eax + 128]
1274    vphaddw    ymm0, ymm0, ymm1  // mutates.
1275    vphaddw    ymm2, ymm2, ymm3
1276    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
1277    vpaddw     ymm2, ymm2, ymm5
1278    vpsrlw     ymm0, ymm0, 7
1279    vpsrlw     ymm2, ymm2, 7
1280    vpackuswb  ymm0, ymm0, ymm2  // mutates.
1281    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1282    vmovdqu    [edx], ymm0
1283    lea        edx, [edx + 32]
1284    sub        ecx, 32
1285    jg         convertloop
1286
1287    vzeroupper
1288    ret
1289  }
1290}
1291#endif  //  HAS_ARGBTOYJROW_AVX2
1292
1293__declspec(naked)
1294void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1295  __asm {
1296    mov        eax, [esp + 4]   /* src_argb */
1297    mov        edx, [esp + 8]   /* dst_y */
1298    mov        ecx, [esp + 12]  /* pix */
1299    movdqa     xmm4, kBGRAToY
1300    movdqa     xmm5, kAddY16
1301
1302 convertloop:
1303    movdqu     xmm0, [eax]
1304    movdqu     xmm1, [eax + 16]
1305    movdqu     xmm2, [eax + 32]
1306    movdqu     xmm3, [eax + 48]
1307    pmaddubsw  xmm0, xmm4
1308    pmaddubsw  xmm1, xmm4
1309    pmaddubsw  xmm2, xmm4
1310    pmaddubsw  xmm3, xmm4
1311    lea        eax, [eax + 64]
1312    phaddw     xmm0, xmm1
1313    phaddw     xmm2, xmm3
1314    psrlw      xmm0, 7
1315    psrlw      xmm2, 7
1316    packuswb   xmm0, xmm2
1317    paddb      xmm0, xmm5
1318    movdqu     [edx], xmm0
1319    lea        edx, [edx + 16]
1320    sub        ecx, 16
1321    jg         convertloop
1322    ret
1323  }
1324}
1325
1326__declspec(naked)
1327void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1328  __asm {
1329    mov        eax, [esp + 4]   /* src_argb */
1330    mov        edx, [esp + 8]   /* dst_y */
1331    mov        ecx, [esp + 12]  /* pix */
1332    movdqa     xmm4, kABGRToY
1333    movdqa     xmm5, kAddY16
1334
1335 convertloop:
1336    movdqu     xmm0, [eax]
1337    movdqu     xmm1, [eax + 16]
1338    movdqu     xmm2, [eax + 32]
1339    movdqu     xmm3, [eax + 48]
1340    pmaddubsw  xmm0, xmm4
1341    pmaddubsw  xmm1, xmm4
1342    pmaddubsw  xmm2, xmm4
1343    pmaddubsw  xmm3, xmm4
1344    lea        eax, [eax + 64]
1345    phaddw     xmm0, xmm1
1346    phaddw     xmm2, xmm3
1347    psrlw      xmm0, 7
1348    psrlw      xmm2, 7
1349    packuswb   xmm0, xmm2
1350    paddb      xmm0, xmm5
1351    movdqu     [edx], xmm0
1352    lea        edx, [edx + 16]
1353    sub        ecx, 16
1354    jg         convertloop
1355    ret
1356  }
1357}
1358
1359__declspec(naked)
1360void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1361  __asm {
1362    mov        eax, [esp + 4]   /* src_argb */
1363    mov        edx, [esp + 8]   /* dst_y */
1364    mov        ecx, [esp + 12]  /* pix */
1365    movdqa     xmm4, kRGBAToY
1366    movdqa     xmm5, kAddY16
1367
1368 convertloop:
1369    movdqu     xmm0, [eax]
1370    movdqu     xmm1, [eax + 16]
1371    movdqu     xmm2, [eax + 32]
1372    movdqu     xmm3, [eax + 48]
1373    pmaddubsw  xmm0, xmm4
1374    pmaddubsw  xmm1, xmm4
1375    pmaddubsw  xmm2, xmm4
1376    pmaddubsw  xmm3, xmm4
1377    lea        eax, [eax + 64]
1378    phaddw     xmm0, xmm1
1379    phaddw     xmm2, xmm3
1380    psrlw      xmm0, 7
1381    psrlw      xmm2, 7
1382    packuswb   xmm0, xmm2
1383    paddb      xmm0, xmm5
1384    movdqu     [edx], xmm0
1385    lea        edx, [edx + 16]
1386    sub        ecx, 16
1387    jg         convertloop
1388    ret
1389  }
1390}
1391
1392__declspec(naked)
1393void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1394                       uint8* dst_u, uint8* dst_v, int width) {
1395  __asm {
1396    push       esi
1397    push       edi
1398    mov        eax, [esp + 8 + 4]   // src_argb
1399    mov        esi, [esp + 8 + 8]   // src_stride_argb
1400    mov        edx, [esp + 8 + 12]  // dst_u
1401    mov        edi, [esp + 8 + 16]  // dst_v
1402    mov        ecx, [esp + 8 + 20]  // pix
1403    movdqa     xmm5, kAddUV128
1404    movdqa     xmm6, kARGBToV
1405    movdqa     xmm7, kARGBToU
1406    sub        edi, edx             // stride from u to v
1407
1408 convertloop:
1409    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1410    movdqu     xmm0, [eax]
1411    movdqu     xmm4, [eax + esi]
1412    pavgb      xmm0, xmm4
1413    movdqu     xmm1, [eax + 16]
1414    movdqu     xmm4, [eax + esi + 16]
1415    pavgb      xmm1, xmm4
1416    movdqu     xmm2, [eax + 32]
1417    movdqu     xmm4, [eax + esi + 32]
1418    pavgb      xmm2, xmm4
1419    movdqu     xmm3, [eax + 48]
1420    movdqu     xmm4, [eax + esi + 48]
1421    pavgb      xmm3, xmm4
1422
1423    lea        eax,  [eax + 64]
1424    movdqa     xmm4, xmm0
1425    shufps     xmm0, xmm1, 0x88
1426    shufps     xmm4, xmm1, 0xdd
1427    pavgb      xmm0, xmm4
1428    movdqa     xmm4, xmm2
1429    shufps     xmm2, xmm3, 0x88
1430    shufps     xmm4, xmm3, 0xdd
1431    pavgb      xmm2, xmm4
1432
1433    // step 2 - convert to U and V
1434    // from here down is very similar to Y code except
1435    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1436    movdqa     xmm1, xmm0
1437    movdqa     xmm3, xmm2
1438    pmaddubsw  xmm0, xmm7  // U
1439    pmaddubsw  xmm2, xmm7
1440    pmaddubsw  xmm1, xmm6  // V
1441    pmaddubsw  xmm3, xmm6
1442    phaddw     xmm0, xmm2
1443    phaddw     xmm1, xmm3
1444    psraw      xmm0, 8
1445    psraw      xmm1, 8
1446    packsswb   xmm0, xmm1
1447    paddb      xmm0, xmm5            // -> unsigned
1448
1449    // step 3 - store 8 U and 8 V values
1450    movlps     qword ptr [edx], xmm0 // U
1451    movhps     qword ptr [edx + edi], xmm0 // V
1452    lea        edx, [edx + 8]
1453    sub        ecx, 16
1454    jg         convertloop
1455
1456    pop        edi
1457    pop        esi
1458    ret
1459  }
1460}
1461
1462__declspec(naked)
1463void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1464                        uint8* dst_u, uint8* dst_v, int width) {
1465  __asm {
1466    push       esi
1467    push       edi
1468    mov        eax, [esp + 8 + 4]   // src_argb
1469    mov        esi, [esp + 8 + 8]   // src_stride_argb
1470    mov        edx, [esp + 8 + 12]  // dst_u
1471    mov        edi, [esp + 8 + 16]  // dst_v
1472    mov        ecx, [esp + 8 + 20]  // pix
1473    movdqa     xmm5, kAddUVJ128
1474    movdqa     xmm6, kARGBToVJ
1475    movdqa     xmm7, kARGBToUJ
1476    sub        edi, edx             // stride from u to v
1477
1478 convertloop:
1479    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1480    movdqu     xmm0, [eax]
1481    movdqu     xmm4, [eax + esi]
1482    pavgb      xmm0, xmm4
1483    movdqu     xmm1, [eax + 16]
1484    movdqu     xmm4, [eax + esi + 16]
1485    pavgb      xmm1, xmm4
1486    movdqu     xmm2, [eax + 32]
1487    movdqu     xmm4, [eax + esi + 32]
1488    pavgb      xmm2, xmm4
1489    movdqu     xmm3, [eax + 48]
1490    movdqu     xmm4, [eax + esi + 48]
1491    pavgb      xmm3, xmm4
1492
1493    lea        eax,  [eax + 64]
1494    movdqa     xmm4, xmm0
1495    shufps     xmm0, xmm1, 0x88
1496    shufps     xmm4, xmm1, 0xdd
1497    pavgb      xmm0, xmm4
1498    movdqa     xmm4, xmm2
1499    shufps     xmm2, xmm3, 0x88
1500    shufps     xmm4, xmm3, 0xdd
1501    pavgb      xmm2, xmm4
1502
1503    // step 2 - convert to U and V
1504    // from here down is very similar to Y code except
1505    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1506    movdqa     xmm1, xmm0
1507    movdqa     xmm3, xmm2
1508    pmaddubsw  xmm0, xmm7  // U
1509    pmaddubsw  xmm2, xmm7
1510    pmaddubsw  xmm1, xmm6  // V
1511    pmaddubsw  xmm3, xmm6
1512    phaddw     xmm0, xmm2
1513    phaddw     xmm1, xmm3
1514    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
1515    paddw      xmm1, xmm5
1516    psraw      xmm0, 8
1517    psraw      xmm1, 8
1518    packsswb   xmm0, xmm1
1519
1520    // step 3 - store 8 U and 8 V values
1521    movlps     qword ptr [edx], xmm0 // U
1522    movhps     qword ptr [edx + edi], xmm0 // V
1523    lea        edx, [edx + 8]
1524    sub        ecx, 16
1525    jg         convertloop
1526
1527    pop        edi
1528    pop        esi
1529    ret
1530  }
1531}
1532
1533#ifdef HAS_ARGBTOUVROW_AVX2
1534__declspec(naked)
1535void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1536                      uint8* dst_u, uint8* dst_v, int width) {
1537  __asm {
1538    push       esi
1539    push       edi
1540    mov        eax, [esp + 8 + 4]   // src_argb
1541    mov        esi, [esp + 8 + 8]   // src_stride_argb
1542    mov        edx, [esp + 8 + 12]  // dst_u
1543    mov        edi, [esp + 8 + 16]  // dst_v
1544    mov        ecx, [esp + 8 + 20]  // pix
1545    vbroadcastf128 ymm5, kAddUV128
1546    vbroadcastf128 ymm6, kARGBToV
1547    vbroadcastf128 ymm7, kARGBToU
1548    sub        edi, edx             // stride from u to v
1549
1550 convertloop:
1551    /* step 1 - subsample 32x2 argb pixels to 16x1 */
1552    vmovdqu    ymm0, [eax]
1553    vmovdqu    ymm1, [eax + 32]
1554    vmovdqu    ymm2, [eax + 64]
1555    vmovdqu    ymm3, [eax + 96]
1556    vpavgb     ymm0, ymm0, [eax + esi]
1557    vpavgb     ymm1, ymm1, [eax + esi + 32]
1558    vpavgb     ymm2, ymm2, [eax + esi + 64]
1559    vpavgb     ymm3, ymm3, [eax + esi + 96]
1560    lea        eax,  [eax + 128]
1561    vshufps    ymm4, ymm0, ymm1, 0x88
1562    vshufps    ymm0, ymm0, ymm1, 0xdd
1563    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1564    vshufps    ymm4, ymm2, ymm3, 0x88
1565    vshufps    ymm2, ymm2, ymm3, 0xdd
1566    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1567
1568    // step 2 - convert to U and V
1569    // from here down is very similar to Y code except
1570    // instead of 32 different pixels, its 16 pixels of U and 16 of V
1571    vpmaddubsw ymm1, ymm0, ymm7  // U
1572    vpmaddubsw ymm3, ymm2, ymm7
1573    vpmaddubsw ymm0, ymm0, ymm6  // V
1574    vpmaddubsw ymm2, ymm2, ymm6
1575    vphaddw    ymm1, ymm1, ymm3  // mutates
1576    vphaddw    ymm0, ymm0, ymm2
1577    vpsraw     ymm1, ymm1, 8
1578    vpsraw     ymm0, ymm0, 8
1579    vpacksswb  ymm0, ymm1, ymm0  // mutates
1580    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1581    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
1582    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1583
1584    // step 3 - store 16 U and 16 V values
1585    vextractf128 [edx], ymm0, 0 // U
1586    vextractf128 [edx + edi], ymm0, 1 // V
1587    lea        edx, [edx + 16]
1588    sub        ecx, 32
1589    jg         convertloop
1590
1591    pop        edi
1592    pop        esi
1593    vzeroupper
1594    ret
1595  }
1596}
1597#endif  // HAS_ARGBTOUVROW_AVX2
1598
1599__declspec(naked)
1600void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1601                          uint8* dst_u, uint8* dst_v, int width) {
1602  __asm {
1603    push       edi
1604    mov        eax, [esp + 4 + 4]   // src_argb
1605    mov        edx, [esp + 4 + 8]   // dst_u
1606    mov        edi, [esp + 4 + 12]  // dst_v
1607    mov        ecx, [esp + 4 + 16]  // pix
1608    movdqa     xmm5, kAddUV128
1609    movdqa     xmm6, kARGBToV
1610    movdqa     xmm7, kARGBToU
1611    sub        edi, edx             // stride from u to v
1612
1613 convertloop:
1614    /* convert to U and V */
1615    movdqu     xmm0, [eax]          // U
1616    movdqu     xmm1, [eax + 16]
1617    movdqu     xmm2, [eax + 32]
1618    movdqu     xmm3, [eax + 48]
1619    pmaddubsw  xmm0, xmm7
1620    pmaddubsw  xmm1, xmm7
1621    pmaddubsw  xmm2, xmm7
1622    pmaddubsw  xmm3, xmm7
1623    phaddw     xmm0, xmm1
1624    phaddw     xmm2, xmm3
1625    psraw      xmm0, 8
1626    psraw      xmm2, 8
1627    packsswb   xmm0, xmm2
1628    paddb      xmm0, xmm5
1629    movdqu     [edx], xmm0
1630
1631    movdqu     xmm0, [eax]          // V
1632    movdqu     xmm1, [eax + 16]
1633    movdqu     xmm2, [eax + 32]
1634    movdqu     xmm3, [eax + 48]
1635    pmaddubsw  xmm0, xmm6
1636    pmaddubsw  xmm1, xmm6
1637    pmaddubsw  xmm2, xmm6
1638    pmaddubsw  xmm3, xmm6
1639    phaddw     xmm0, xmm1
1640    phaddw     xmm2, xmm3
1641    psraw      xmm0, 8
1642    psraw      xmm2, 8
1643    packsswb   xmm0, xmm2
1644    paddb      xmm0, xmm5
1645    lea        eax,  [eax + 64]
1646    movdqu     [edx + edi], xmm0
1647    lea        edx,  [edx + 16]
1648    sub        ecx,  16
1649    jg         convertloop
1650
1651    pop        edi
1652    ret
1653  }
1654}
1655
1656__declspec(naked)
1657void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1658                          uint8* dst_u, uint8* dst_v, int width) {
1659  __asm {
1660    push       edi
1661    mov        eax, [esp + 4 + 4]   // src_argb
1662    mov        edx, [esp + 4 + 8]   // dst_u
1663    mov        edi, [esp + 4 + 12]  // dst_v
1664    mov        ecx, [esp + 4 + 16]  // pix
1665    movdqa     xmm5, kAddUV128
1666    movdqa     xmm6, kARGBToV
1667    movdqa     xmm7, kARGBToU
1668    sub        edi, edx             // stride from u to v
1669
1670 convertloop:
1671    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1672    movdqu     xmm0, [eax]
1673    movdqu     xmm1, [eax + 16]
1674    movdqu     xmm2, [eax + 32]
1675    movdqu     xmm3, [eax + 48]
1676    lea        eax,  [eax + 64]
1677    movdqa     xmm4, xmm0
1678    shufps     xmm0, xmm1, 0x88
1679    shufps     xmm4, xmm1, 0xdd
1680    pavgb      xmm0, xmm4
1681    movdqa     xmm4, xmm2
1682    shufps     xmm2, xmm3, 0x88
1683    shufps     xmm4, xmm3, 0xdd
1684    pavgb      xmm2, xmm4
1685
1686    // step 2 - convert to U and V
1687    // from here down is very similar to Y code except
1688    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1689    movdqa     xmm1, xmm0
1690    movdqa     xmm3, xmm2
1691    pmaddubsw  xmm0, xmm7  // U
1692    pmaddubsw  xmm2, xmm7
1693    pmaddubsw  xmm1, xmm6  // V
1694    pmaddubsw  xmm3, xmm6
1695    phaddw     xmm0, xmm2
1696    phaddw     xmm1, xmm3
1697    psraw      xmm0, 8
1698    psraw      xmm1, 8
1699    packsswb   xmm0, xmm1
1700    paddb      xmm0, xmm5            // -> unsigned
1701
1702    // step 3 - store 8 U and 8 V values
1703    movlps     qword ptr [edx], xmm0 // U
1704    movhps     qword ptr [edx + edi], xmm0 // V
1705    lea        edx, [edx + 8]
1706    sub        ecx, 16
1707    jg         convertloop
1708
1709    pop        edi
1710    ret
1711  }
1712}
1713
1714__declspec(naked)
1715void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1716                       uint8* dst_u, uint8* dst_v, int width) {
1717  __asm {
1718    push       esi
1719    push       edi
1720    mov        eax, [esp + 8 + 4]   // src_argb
1721    mov        esi, [esp + 8 + 8]   // src_stride_argb
1722    mov        edx, [esp + 8 + 12]  // dst_u
1723    mov        edi, [esp + 8 + 16]  // dst_v
1724    mov        ecx, [esp + 8 + 20]  // pix
1725    movdqa     xmm5, kAddUV128
1726    movdqa     xmm6, kBGRAToV
1727    movdqa     xmm7, kBGRAToU
1728    sub        edi, edx             // stride from u to v
1729
1730 convertloop:
1731    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1732    movdqu     xmm0, [eax]
1733    movdqu     xmm4, [eax + esi]
1734    pavgb      xmm0, xmm4
1735    movdqu     xmm1, [eax + 16]
1736    movdqu     xmm4, [eax + esi + 16]
1737    pavgb      xmm1, xmm4
1738    movdqu     xmm2, [eax + 32]
1739    movdqu     xmm4, [eax + esi + 32]
1740    pavgb      xmm2, xmm4
1741    movdqu     xmm3, [eax + 48]
1742    movdqu     xmm4, [eax + esi + 48]
1743    pavgb      xmm3, xmm4
1744
1745    lea        eax,  [eax + 64]
1746    movdqa     xmm4, xmm0
1747    shufps     xmm0, xmm1, 0x88
1748    shufps     xmm4, xmm1, 0xdd
1749    pavgb      xmm0, xmm4
1750    movdqa     xmm4, xmm2
1751    shufps     xmm2, xmm3, 0x88
1752    shufps     xmm4, xmm3, 0xdd
1753    pavgb      xmm2, xmm4
1754
1755    // step 2 - convert to U and V
1756    // from here down is very similar to Y code except
1757    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1758    movdqa     xmm1, xmm0
1759    movdqa     xmm3, xmm2
1760    pmaddubsw  xmm0, xmm7  // U
1761    pmaddubsw  xmm2, xmm7
1762    pmaddubsw  xmm1, xmm6  // V
1763    pmaddubsw  xmm3, xmm6
1764    phaddw     xmm0, xmm2
1765    phaddw     xmm1, xmm3
1766    psraw      xmm0, 8
1767    psraw      xmm1, 8
1768    packsswb   xmm0, xmm1
1769    paddb      xmm0, xmm5            // -> unsigned
1770
1771    // step 3 - store 8 U and 8 V values
1772    movlps     qword ptr [edx], xmm0 // U
1773    movhps     qword ptr [edx + edi], xmm0 // V
1774    lea        edx, [edx + 8]
1775    sub        ecx, 16
1776    jg         convertloop
1777
1778    pop        edi
1779    pop        esi
1780    ret
1781  }
1782}
1783
1784__declspec(naked)
1785void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1786                       uint8* dst_u, uint8* dst_v, int width) {
1787  __asm {
1788    push       esi
1789    push       edi
1790    mov        eax, [esp + 8 + 4]   // src_argb
1791    mov        esi, [esp + 8 + 8]   // src_stride_argb
1792    mov        edx, [esp + 8 + 12]  // dst_u
1793    mov        edi, [esp + 8 + 16]  // dst_v
1794    mov        ecx, [esp + 8 + 20]  // pix
1795    movdqa     xmm5, kAddUV128
1796    movdqa     xmm6, kABGRToV
1797    movdqa     xmm7, kABGRToU
1798    sub        edi, edx             // stride from u to v
1799
1800 convertloop:
1801    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1802    movdqu     xmm0, [eax]
1803    movdqu     xmm4, [eax + esi]
1804    pavgb      xmm0, xmm4
1805    movdqu     xmm1, [eax + 16]
1806    movdqu     xmm4, [eax + esi + 16]
1807    pavgb      xmm1, xmm4
1808    movdqu     xmm2, [eax + 32]
1809    movdqu     xmm4, [eax + esi + 32]
1810    pavgb      xmm2, xmm4
1811    movdqu     xmm3, [eax + 48]
1812    movdqu     xmm4, [eax + esi + 48]
1813    pavgb      xmm3, xmm4
1814
1815    lea        eax,  [eax + 64]
1816    movdqa     xmm4, xmm0
1817    shufps     xmm0, xmm1, 0x88
1818    shufps     xmm4, xmm1, 0xdd
1819    pavgb      xmm0, xmm4
1820    movdqa     xmm4, xmm2
1821    shufps     xmm2, xmm3, 0x88
1822    shufps     xmm4, xmm3, 0xdd
1823    pavgb      xmm2, xmm4
1824
1825    // step 2 - convert to U and V
1826    // from here down is very similar to Y code except
1827    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1828    movdqa     xmm1, xmm0
1829    movdqa     xmm3, xmm2
1830    pmaddubsw  xmm0, xmm7  // U
1831    pmaddubsw  xmm2, xmm7
1832    pmaddubsw  xmm1, xmm6  // V
1833    pmaddubsw  xmm3, xmm6
1834    phaddw     xmm0, xmm2
1835    phaddw     xmm1, xmm3
1836    psraw      xmm0, 8
1837    psraw      xmm1, 8
1838    packsswb   xmm0, xmm1
1839    paddb      xmm0, xmm5            // -> unsigned
1840
1841    // step 3 - store 8 U and 8 V values
1842    movlps     qword ptr [edx], xmm0 // U
1843    movhps     qword ptr [edx + edi], xmm0 // V
1844    lea        edx, [edx + 8]
1845    sub        ecx, 16
1846    jg         convertloop
1847
1848    pop        edi
1849    pop        esi
1850    ret
1851  }
1852}
1853
1854__declspec(naked)
1855void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1856                       uint8* dst_u, uint8* dst_v, int width) {
1857  __asm {
1858    push       esi
1859    push       edi
1860    mov        eax, [esp + 8 + 4]   // src_argb
1861    mov        esi, [esp + 8 + 8]   // src_stride_argb
1862    mov        edx, [esp + 8 + 12]  // dst_u
1863    mov        edi, [esp + 8 + 16]  // dst_v
1864    mov        ecx, [esp + 8 + 20]  // pix
1865    movdqa     xmm5, kAddUV128
1866    movdqa     xmm6, kRGBAToV
1867    movdqa     xmm7, kRGBAToU
1868    sub        edi, edx             // stride from u to v
1869
1870 convertloop:
1871    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1872    movdqu     xmm0, [eax]
1873    movdqu     xmm4, [eax + esi]
1874    pavgb      xmm0, xmm4
1875    movdqu     xmm1, [eax + 16]
1876    movdqu     xmm4, [eax + esi + 16]
1877    pavgb      xmm1, xmm4
1878    movdqu     xmm2, [eax + 32]
1879    movdqu     xmm4, [eax + esi + 32]
1880    pavgb      xmm2, xmm4
1881    movdqu     xmm3, [eax + 48]
1882    movdqu     xmm4, [eax + esi + 48]
1883    pavgb      xmm3, xmm4
1884
1885    lea        eax,  [eax + 64]
1886    movdqa     xmm4, xmm0
1887    shufps     xmm0, xmm1, 0x88
1888    shufps     xmm4, xmm1, 0xdd
1889    pavgb      xmm0, xmm4
1890    movdqa     xmm4, xmm2
1891    shufps     xmm2, xmm3, 0x88
1892    shufps     xmm4, xmm3, 0xdd
1893    pavgb      xmm2, xmm4
1894
1895    // step 2 - convert to U and V
1896    // from here down is very similar to Y code except
1897    // instead of 16 different pixels, its 8 pixels of U and 8 of V
1898    movdqa     xmm1, xmm0
1899    movdqa     xmm3, xmm2
1900    pmaddubsw  xmm0, xmm7  // U
1901    pmaddubsw  xmm2, xmm7
1902    pmaddubsw  xmm1, xmm6  // V
1903    pmaddubsw  xmm3, xmm6
1904    phaddw     xmm0, xmm2
1905    phaddw     xmm1, xmm3
1906    psraw      xmm0, 8
1907    psraw      xmm1, 8
1908    packsswb   xmm0, xmm1
1909    paddb      xmm0, xmm5            // -> unsigned
1910
1911    // step 3 - store 8 U and 8 V values
1912    movlps     qword ptr [edx], xmm0 // U
1913    movhps     qword ptr [edx + edi], xmm0 // V
1914    lea        edx, [edx + 8]
1915    sub        ecx, 16
1916    jg         convertloop
1917
1918    pop        edi
1919    pop        esi
1920    ret
1921  }
1922}
1923#endif  // HAS_ARGBTOYROW_SSSE3
1924
1925// Read 16 UV from 444
1926#define READYUV444_AVX2 __asm {                                                \
1927    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \
1928    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \
1929    __asm lea        esi,  [esi + 16]                                          \
1930    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1931    __asm vpermq     ymm1, ymm1, 0xd8                                          \
1932    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1933  }
1934
1935// Read 8 UV from 422, upsample to 16 UV.
1936#define READYUV422_AVX2 __asm {                                                \
1937    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
1938    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
1939    __asm lea        esi,  [esi + 8]                                           \
1940    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1941    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1942    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1943  }
1944
1945// Read 4 UV from 411, upsample to 16 UV.
1946#define READYUV411_AVX2 __asm {                                                \
1947    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \
1948    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \
1949    __asm lea        esi,  [esi + 4]                                           \
1950    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1951    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1952    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1953    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
1954  }
1955
1956// Read 8 UV from NV12, upsample to 16 UV.
1957#define READNV12_AVX2 __asm {                                                  \
1958    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
1959    __asm lea        esi,  [esi + 16]                                          \
1960    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1961    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1962  }
1963
1964// Convert 16 pixels: 16 UV and 16 Y.
1965#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
1966    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
1967    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \
1968    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \
1969    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \
1970    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \
1971    __asm vpsubw     ymm2, ymm3, ymm2                                          \
1972    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \
1973    __asm vpsubw     ymm1, ymm3, ymm1                                          \
1974    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \
1975    __asm vpsubw     ymm0, ymm3, ymm0                                          \
1976    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
1977    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \
1978    __asm lea        eax, [eax + 16]                                           \
1979    __asm vpermq     ymm3, ymm3, 0xd8                                          \
1980    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
1981    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \
1982    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
1983    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
1984    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
1985    __asm vpsraw     ymm0, ymm0, 6                                             \
1986    __asm vpsraw     ymm1, ymm1, 6                                             \
1987    __asm vpsraw     ymm2, ymm2, 6                                             \
1988    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
1989    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
1990    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
1991  }
1992
1993// Store 16 ARGB values.
1994#define STOREARGB_AVX2 __asm {                                                 \
1995    /* Step 3: Weave into ARGB */                                              \
1996    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
1997    __asm vpermq     ymm0, ymm0, 0xd8                                          \
1998    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
1999    __asm vpermq     ymm2, ymm2, 0xd8                                          \
2000    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
2001    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
2002    __asm vmovdqu    0[edx], ymm1                                              \
2003    __asm vmovdqu    32[edx], ymm0                                             \
2004    __asm lea        edx,  [edx + 64]                                          \
2005  }
2006
2007#ifdef HAS_I422TOARGBROW_AVX2
2008// 16 pixels
2009// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2010__declspec(naked)
2011void I422ToARGBRow_AVX2(const uint8* y_buf,
2012                        const uint8* u_buf,
2013                        const uint8* v_buf,
2014                        uint8* dst_argb,
2015                        int width) {
2016  __asm {
2017    push       esi
2018    push       edi
2019    mov        eax, [esp + 8 + 4]   // Y
2020    mov        esi, [esp + 8 + 8]   // U
2021    mov        edi, [esp + 8 + 12]  // V
2022    mov        edx, [esp + 8 + 16]  // argb
2023    mov        ecx, [esp + 8 + 20]  // width
2024    sub        edi, esi
2025    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2026
2027 convertloop:
2028    READYUV422_AVX2
2029    YUVTORGB_AVX2(kYuvConstants)
2030    STOREARGB_AVX2
2031
2032    sub        ecx, 16
2033    jg         convertloop
2034
2035    pop        edi
2036    pop        esi
2037    vzeroupper
2038    ret
2039  }
2040}
2041#endif  // HAS_I422TOARGBROW_AVX2
2042
2043#ifdef HAS_J422TOARGBROW_AVX2
2044// 16 pixels
2045// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2046__declspec(naked)
2047void J422ToARGBRow_AVX2(const uint8* y_buf,
2048                        const uint8* u_buf,
2049                        const uint8* v_buf,
2050                        uint8* dst_argb,
2051                        int width) {
2052  __asm {
2053    push       esi
2054    push       edi
2055    mov        eax, [esp + 8 + 4]   // Y
2056    mov        esi, [esp + 8 + 8]   // U
2057    mov        edi, [esp + 8 + 12]  // V
2058    mov        edx, [esp + 8 + 16]  // argb
2059    mov        ecx, [esp + 8 + 20]  // width
2060    sub        edi, esi
2061    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2062
2063 convertloop:
2064    READYUV422_AVX2
2065    YUVTORGB_AVX2(kYuvJConstants)
2066    STOREARGB_AVX2
2067
2068    sub        ecx, 16
2069    jg         convertloop
2070
2071    pop        edi
2072    pop        esi
2073    vzeroupper
2074    ret
2075  }
2076}
2077#endif  // HAS_J422TOARGBROW_AVX2
2078
2079#ifdef HAS_I444TOARGBROW_AVX2
2080// 16 pixels
2081// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2082__declspec(naked)
2083void I444ToARGBRow_AVX2(const uint8* y_buf,
2084                        const uint8* u_buf,
2085                        const uint8* v_buf,
2086                        uint8* dst_argb,
2087                        int width) {
2088  __asm {
2089    push       esi
2090    push       edi
2091    mov        eax, [esp + 8 + 4]   // Y
2092    mov        esi, [esp + 8 + 8]   // U
2093    mov        edi, [esp + 8 + 12]  // V
2094    mov        edx, [esp + 8 + 16]  // argb
2095    mov        ecx, [esp + 8 + 20]  // width
2096    sub        edi, esi
2097    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2098
2099 convertloop:
2100    READYUV444_AVX2
2101    YUVTORGB_AVX2(kYuvConstants)
2102    STOREARGB_AVX2
2103
2104    sub        ecx, 16
2105    jg         convertloop
2106
2107    pop        edi
2108    pop        esi
2109    vzeroupper
2110    ret
2111  }
2112}
2113#endif  // HAS_I444TOARGBROW_AVX2
2114
2115#ifdef HAS_I411TOARGBROW_AVX2
2116// 16 pixels
2117// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2118__declspec(naked)
2119void I411ToARGBRow_AVX2(const uint8* y_buf,
2120                        const uint8* u_buf,
2121                        const uint8* v_buf,
2122                        uint8* dst_argb,
2123                        int width) {
2124  __asm {
2125    push       esi
2126    push       edi
2127    mov        eax, [esp + 8 + 4]   // Y
2128    mov        esi, [esp + 8 + 8]   // U
2129    mov        edi, [esp + 8 + 12]  // V
2130    mov        edx, [esp + 8 + 16]  // argb
2131    mov        ecx, [esp + 8 + 20]  // width
2132    sub        edi, esi
2133    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2134
2135 convertloop:
2136    READYUV411_AVX2
2137    YUVTORGB_AVX2(kYuvConstants)
2138    STOREARGB_AVX2
2139
2140    sub        ecx, 16
2141    jg         convertloop
2142
2143    pop        edi
2144    pop        esi
2145    vzeroupper
2146    ret
2147  }
2148}
2149#endif  // HAS_I411TOARGBROW_AVX2
2150
2151#ifdef HAS_NV12TOARGBROW_AVX2
2152// 16 pixels.
2153// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2154__declspec(naked)
2155void NV12ToARGBRow_AVX2(const uint8* y_buf,
2156                        const uint8* uv_buf,
2157                        uint8* dst_argb,
2158                        int width) {
2159  __asm {
2160    push       esi
2161    mov        eax, [esp + 4 + 4]   // Y
2162    mov        esi, [esp + 4 + 8]   // UV
2163    mov        edx, [esp + 4 + 12]  // argb
2164    mov        ecx, [esp + 4 + 16]  // width
2165    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2166
2167 convertloop:
2168    READNV12_AVX2
2169    YUVTORGB_AVX2(kYuvConstants)
2170    STOREARGB_AVX2
2171
2172    sub        ecx, 16
2173    jg         convertloop
2174
2175    pop        esi
2176    vzeroupper
2177    ret
2178  }
2179}
2180#endif  // HAS_NV12TOARGBROW_AVX2
2181
2182#ifdef HAS_NV21TOARGBROW_AVX2
2183// 16 pixels.
2184// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
2185__declspec(naked)
2186void NV21ToARGBRow_AVX2(const uint8* y_buf,
2187                        const uint8* uv_buf,
2188                        uint8* dst_argb,
2189                        int width) {
2190  __asm {
2191    push       esi
2192    mov        eax, [esp + 4 + 4]   // Y
2193    mov        esi, [esp + 4 + 8]   // UV
2194    mov        edx, [esp + 4 + 12]  // argb
2195    mov        ecx, [esp + 4 + 16]  // width
2196    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2197
2198 convertloop:
2199    READNV12_AVX2
2200    YUVTORGB_AVX2(kYvuConstants)
2201    STOREARGB_AVX2
2202
2203    sub        ecx, 16
2204    jg         convertloop
2205
2206    pop        esi
2207    vzeroupper
2208    ret
2209  }
2210}
2211#endif  // HAS_NV21TOARGBROW_AVX2
2212
2213#ifdef HAS_I422TOBGRAROW_AVX2
2214// 16 pixels
2215// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
2216// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
2217__declspec(naked)
2218void I422ToBGRARow_AVX2(const uint8* y_buf,
2219                        const uint8* u_buf,
2220                        const uint8* v_buf,
2221                        uint8* dst_argb,
2222                        int width) {
2223  __asm {
2224    push       esi
2225    push       edi
2226    mov        eax, [esp + 8 + 4]   // Y
2227    mov        esi, [esp + 8 + 8]   // U
2228    mov        edi, [esp + 8 + 12]  // V
2229    mov        edx, [esp + 8 + 16]  // argb
2230    mov        ecx, [esp + 8 + 20]  // width
2231    sub        edi, esi
2232    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2233
2234 convertloop:
2235    READYUV422_AVX2
2236    YUVTORGB_AVX2(kYuvConstants)
2237
2238    // Step 3: Weave into BGRA
2239    vpunpcklbw ymm1, ymm1, ymm0           // GB
2240    vpermq     ymm1, ymm1, 0xd8
2241    vpunpcklbw ymm2, ymm5, ymm2           // AR
2242    vpermq     ymm2, ymm2, 0xd8
2243    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels
2244    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels
2245    vmovdqu    [edx], ymm0
2246    vmovdqu    [edx + 32], ymm2
2247    lea        edx,  [edx + 64]
2248    sub        ecx, 16
2249    jg         convertloop
2250
2251    pop        edi
2252    pop        esi
2253    vzeroupper
2254    ret
2255  }
2256}
2257#endif  // HAS_I422TOBGRAROW_AVX2
2258
2259#ifdef HAS_I422TORGBAROW_AVX2
2260// 16 pixels
2261// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2262// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
2263__declspec(naked)
2264void I422ToRGBARow_AVX2(const uint8* y_buf,
2265                        const uint8* u_buf,
2266                        const uint8* v_buf,
2267                        uint8* dst_argb,
2268                        int width) {
2269  __asm {
2270    push       esi
2271    push       edi
2272    mov        eax, [esp + 8 + 4]   // Y
2273    mov        esi, [esp + 8 + 8]   // U
2274    mov        edi, [esp + 8 + 12]  // V
2275    mov        edx, [esp + 8 + 16]  // argb
2276    mov        ecx, [esp + 8 + 20]  // width
2277    sub        edi, esi
2278    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2279
2280 convertloop:
2281    READYUV422_AVX2
2282    YUVTORGB_AVX2(kYuvConstants)
2283
2284    // Step 3: Weave into RGBA
2285    vpunpcklbw ymm1, ymm1, ymm2           // GR
2286    vpermq     ymm1, ymm1, 0xd8
2287    vpunpcklbw ymm2, ymm5, ymm0           // AB
2288    vpermq     ymm2, ymm2, 0xd8
2289    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels
2290    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels
2291    vmovdqu    [edx], ymm0
2292    vmovdqu    [edx + 32], ymm1
2293    lea        edx,  [edx + 64]
2294    sub        ecx, 16
2295    jg         convertloop
2296
2297    pop        edi
2298    pop        esi
2299    vzeroupper
2300    ret
2301  }
2302}
2303#endif  // HAS_I422TORGBAROW_AVX2
2304
2305#ifdef HAS_I422TOABGRROW_AVX2
2306// 16 pixels
2307// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
2308// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
2309__declspec(naked)
2310void I422ToABGRRow_AVX2(const uint8* y_buf,
2311                        const uint8* u_buf,
2312                        const uint8* v_buf,
2313                        uint8* dst_argb,
2314                        int width) {
2315  __asm {
2316    push       esi
2317    push       edi
2318    mov        eax, [esp + 8 + 4]   // Y
2319    mov        esi, [esp + 8 + 8]   // U
2320    mov        edi, [esp + 8 + 12]  // V
2321    mov        edx, [esp + 8 + 16]  // argb
2322    mov        ecx, [esp + 8 + 20]  // width
2323    sub        edi, esi
2324    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2325
2326 convertloop:
2327    READYUV422_AVX2
2328    YUVTORGB_AVX2(kYuvConstants)
2329
2330    // Step 3: Weave into ABGR
2331    vpunpcklbw ymm1, ymm2, ymm1           // RG
2332    vpermq     ymm1, ymm1, 0xd8
2333    vpunpcklbw ymm2, ymm0, ymm5           // BA
2334    vpermq     ymm2, ymm2, 0xd8
2335    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels
2336    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels
2337    vmovdqu    [edx], ymm0
2338    vmovdqu    [edx + 32], ymm1
2339    lea        edx,  [edx + 64]
2340    sub        ecx, 16
2341    jg         convertloop
2342
2343    pop        edi
2344    pop        esi
2345    vzeroupper
2346    ret
2347  }
2348}
2349#endif  // HAS_I422TOABGRROW_AVX2
2350
2351#if defined(HAS_I422TOARGBROW_SSSE3)
2352// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2353
2354// Read 8 UV from 444.
2355#define READYUV444 __asm {                                                     \
2356    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
2357    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
2358    __asm lea        esi,  [esi + 8]                                           \
2359    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2360  }
2361
2362// Read 4 UV from 422, upsample to 8 UV.
2363#define READYUV422 __asm {                                                     \
2364    __asm movd       xmm0, [esi]          /* U */                              \
2365    __asm movd       xmm1, [esi + edi]    /* V */                              \
2366    __asm lea        esi,  [esi + 4]                                           \
2367    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2368    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2369  }
2370
2371// Read 2 UV from 411, upsample to 8 UV.
2372#define READYUV411 __asm {                                                     \
2373    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
2374    __asm movd       xmm0, ebx                                                 \
2375    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
2376    __asm movd       xmm1, ebx                                                 \
2377    __asm lea        esi,  [esi + 2]                                           \
2378    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2379    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2380    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
2381  }
2382
2383// Read 4 UV from NV12, upsample to 8 UV.
2384#define READNV12 __asm {                                                       \
2385    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
2386    __asm lea        esi,  [esi + 8]                                           \
2387    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2388  }
2389
2390// Convert 8 pixels: 8 UV and 8 Y.
2391#define YUVTORGB(YuvConstants) __asm {                                         \
2392    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
2393    __asm movdqa     xmm1, xmm0                                                \
2394    __asm movdqa     xmm2, xmm0                                                \
2395    __asm movdqa     xmm3, xmm0                                                \
2396    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
2397    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
2398    __asm psubw      xmm0, xmm1                                                \
2399    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
2400    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
2401    __asm psubw      xmm1, xmm2                                                \
2402    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
2403    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
2404    __asm psubw      xmm2, xmm3                                                \
2405    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
2406    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
2407    __asm lea        eax, [eax + 8]                                            \
2408    __asm punpcklbw  xmm3, xmm3                                                \
2409    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
2410    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
2411    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
2412    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
2413    __asm psraw      xmm0, 6                                                   \
2414    __asm psraw      xmm1, 6                                                   \
2415    __asm psraw      xmm2, 6                                                   \
2416    __asm packuswb   xmm0, xmm0           /* B */                              \
2417    __asm packuswb   xmm1, xmm1           /* G */                              \
2418    __asm packuswb   xmm2, xmm2           /* R */                              \
2419  }
2420
2421// Store 8 ARGB values.
2422#define STOREARGB __asm {                                                      \
2423    /* Step 3: Weave into ARGB */                                              \
2424    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2425    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
2426    __asm movdqa     xmm1, xmm0                                                \
2427    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
2428    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
2429    __asm movdqu     0[edx], xmm0                                              \
2430    __asm movdqu     16[edx], xmm1                                             \
2431    __asm lea        edx,  [edx + 32]                                          \
2432  }
2433
2434// Store 8 BGRA values.
2435#define STOREBGRA __asm {                                                      \
2436    /* Step 3: Weave into BGRA */                                              \
2437    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
2438    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
2439    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
2440    __asm movdqa     xmm0, xmm5                                                \
2441    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
2442    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
2443    __asm movdqu     0[edx], xmm5                                              \
2444    __asm movdqu     16[edx], xmm0                                             \
2445    __asm lea        edx,  [edx + 32]                                          \
2446  }
2447
2448// Store 8 ABGR values.
2449#define STOREABGR __asm {                                                      \
2450    /* Step 3: Weave into ABGR */                                              \
2451    __asm punpcklbw  xmm2, xmm1           /* RG */                             \
2452    __asm punpcklbw  xmm0, xmm5           /* BA */                             \
2453    __asm movdqa     xmm1, xmm2                                                \
2454    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
2455    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
2456    __asm movdqu     0[edx], xmm2                                              \
2457    __asm movdqu     16[edx], xmm1                                             \
2458    __asm lea        edx,  [edx + 32]                                          \
2459  }
2460
2461// Store 8 RGBA values.
2462#define STORERGBA __asm {                                                      \
2463    /* Step 3: Weave into RGBA */                                              \
2464    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
2465    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
2466    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
2467    __asm movdqa     xmm0, xmm5                                                \
2468    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
2469    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
2470    __asm movdqu     0[edx], xmm5                                              \
2471    __asm movdqu     16[edx], xmm0                                             \
2472    __asm lea        edx,  [edx + 32]                                          \
2473  }
2474
2475// Store 8 RGB24 values.
2476#define STORERGB24 __asm {                                                     \
2477    /* Step 3: Weave into RRGB */                                              \
2478    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2479    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
2480    __asm movdqa     xmm1, xmm0                                                \
2481    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
2482    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
2483    /* Step 4: RRGB -> RGB24 */                                                \
2484    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
2485    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
2486    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
2487    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
2488    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
2489    __asm lea        edx,  [edx + 24]                                          \
2490  }
2491
2492// Store 8 RAW values.
2493#define STORERAW __asm {                                                       \
2494    /* Step 3: Weave into RRGB */                                              \
2495    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2496    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
2497    __asm movdqa     xmm1, xmm0                                                \
2498    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
2499    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
2500    /* Step 4: RRGB -> RAW */                                                  \
2501    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
2502    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
2503    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
2504    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
2505    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
2506    __asm lea        edx,  [edx + 24]                                          \
2507  }
2508
2509// Store 8 RGB565 values.
2510#define STORERGB565 __asm {                                                    \
2511    /* Step 3: Weave into RRGB */                                              \
2512    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2513    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
2514    __asm movdqa     xmm1, xmm0                                                \
2515    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
2516    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
2517    /* Step 4: RRGB -> RGB565 */                                               \
2518    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
2519    __asm movdqa     xmm2, xmm0    /* G */                                     \
2520    __asm pslld      xmm0, 8       /* R */                                     \
2521    __asm psrld      xmm3, 3       /* B */                                     \
2522    __asm psrld      xmm2, 5       /* G */                                     \
2523    __asm psrad      xmm0, 16      /* R */                                     \
2524    __asm pand       xmm3, xmm5    /* B */                                     \
2525    __asm pand       xmm2, xmm6    /* G */                                     \
2526    __asm pand       xmm0, xmm7    /* R */                                     \
2527    __asm por        xmm3, xmm2    /* BG */                                    \
2528    __asm por        xmm0, xmm3    /* BGR */                                   \
2529    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
2530    __asm movdqa     xmm2, xmm1    /* G */                                     \
2531    __asm pslld      xmm1, 8       /* R */                                     \
2532    __asm psrld      xmm3, 3       /* B */                                     \
2533    __asm psrld      xmm2, 5       /* G */                                     \
2534    __asm psrad      xmm1, 16      /* R */                                     \
2535    __asm pand       xmm3, xmm5    /* B */                                     \
2536    __asm pand       xmm2, xmm6    /* G */                                     \
2537    __asm pand       xmm1, xmm7    /* R */                                     \
2538    __asm por        xmm3, xmm2    /* BG */                                    \
2539    __asm por        xmm1, xmm3    /* BGR */                                   \
2540    __asm packssdw   xmm0, xmm1                                                \
2541    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
2542    __asm lea        edx, [edx + 16]                                           \
2543  }
2544
2545// 8 pixels.
2546// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2547__declspec(naked)
2548void I444ToARGBRow_SSSE3(const uint8* y_buf,
2549                         const uint8* u_buf,
2550                         const uint8* v_buf,
2551                         uint8* dst_argb,
2552                         int width) {
2553  __asm {
2554    push       esi
2555    push       edi
2556    mov        eax, [esp + 8 + 4]   // Y
2557    mov        esi, [esp + 8 + 8]   // U
2558    mov        edi, [esp + 8 + 12]  // V
2559    mov        edx, [esp + 8 + 16]  // argb
2560    mov        ecx, [esp + 8 + 20]  // width
2561    sub        edi, esi
2562    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2563
2564 convertloop:
2565    READYUV444
2566    YUVTORGB(kYuvConstants)
2567    STOREARGB
2568
2569    sub        ecx, 8
2570    jg         convertloop
2571
2572    pop        edi
2573    pop        esi
2574    ret
2575  }
2576}
2577
2578// 8 pixels.
2579// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2580__declspec(naked)
2581void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2582                          const uint8* u_buf,
2583                          const uint8* v_buf,
2584                          uint8* dst_rgb24,
2585                          int width) {
2586  __asm {
2587    push       esi
2588    push       edi
2589    mov        eax, [esp + 8 + 4]   // Y
2590    mov        esi, [esp + 8 + 8]   // U
2591    mov        edi, [esp + 8 + 12]  // V
2592    mov        edx, [esp + 8 + 16]  // rgb24
2593    mov        ecx, [esp + 8 + 20]  // width
2594    sub        edi, esi
2595    movdqa     xmm5, kShuffleMaskARGBToRGB24_0
2596    movdqa     xmm6, kShuffleMaskARGBToRGB24
2597
2598 convertloop:
2599    READYUV422
2600    YUVTORGB(kYuvConstants)
2601    STORERGB24
2602
2603    sub        ecx, 8
2604    jg         convertloop
2605
2606    pop        edi
2607    pop        esi
2608    ret
2609  }
2610}
2611
2612// 8 pixels.
2613// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
2614__declspec(naked)
2615void I422ToRAWRow_SSSE3(const uint8* y_buf,
2616                        const uint8* u_buf,
2617                        const uint8* v_buf,
2618                        uint8* dst_raw,
2619                        int width) {
2620  __asm {
2621    push       esi
2622    push       edi
2623    mov        eax, [esp + 8 + 4]   // Y
2624    mov        esi, [esp + 8 + 8]   // U
2625    mov        edi, [esp + 8 + 12]  // V
2626    mov        edx, [esp + 8 + 16]  // raw
2627    mov        ecx, [esp + 8 + 20]  // width
2628    sub        edi, esi
2629    movdqa     xmm5, kShuffleMaskARGBToRAW_0
2630    movdqa     xmm6, kShuffleMaskARGBToRAW
2631
2632 convertloop:
2633    READYUV422
2634    YUVTORGB(kYuvConstants)
2635    STORERAW
2636
2637    sub        ecx, 8
2638    jg         convertloop
2639
2640    pop        edi
2641    pop        esi
2642    ret
2643  }
2644}
2645
2646// 8 pixels
2647// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2648__declspec(naked)
2649void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2650                           const uint8* u_buf,
2651                           const uint8* v_buf,
2652                           uint8* rgb565_buf,
2653                           int width) {
2654  __asm {
2655    push       esi
2656    push       edi
2657    mov        eax, [esp + 8 + 4]   // Y
2658    mov        esi, [esp + 8 + 8]   // U
2659    mov        edi, [esp + 8 + 12]  // V
2660    mov        edx, [esp + 8 + 16]  // rgb565
2661    mov        ecx, [esp + 8 + 20]  // width
2662    sub        edi, esi
2663    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
2664    psrld      xmm5, 27
2665    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
2666    psrld      xmm6, 26
2667    pslld      xmm6, 5
2668    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
2669    pslld      xmm7, 11
2670
2671 convertloop:
2672    READYUV422
2673    YUVTORGB(kYuvConstants)
2674    STORERGB565
2675
2676    sub        ecx, 8
2677    jg         convertloop
2678
2679    pop        edi
2680    pop        esi
2681    ret
2682  }
2683}
2684
2685// 8 pixels.
2686// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2687__declspec(naked)
2688void I422ToARGBRow_SSSE3(const uint8* y_buf,
2689                         const uint8* u_buf,
2690                         const uint8* v_buf,
2691                         uint8* dst_argb,
2692                         int width) {
2693  __asm {
2694    push       esi
2695    push       edi
2696    mov        eax, [esp + 8 + 4]   // Y
2697    mov        esi, [esp + 8 + 8]   // U
2698    mov        edi, [esp + 8 + 12]  // V
2699    mov        edx, [esp + 8 + 16]  // argb
2700    mov        ecx, [esp + 8 + 20]  // width
2701    sub        edi, esi
2702    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2703
2704 convertloop:
2705    READYUV422
2706    YUVTORGB(kYuvConstants)
2707    STOREARGB
2708
2709    sub        ecx, 8
2710    jg         convertloop
2711
2712    pop        edi
2713    pop        esi
2714    ret
2715  }
2716}
2717
2718// 8 pixels.
2719// JPeg color space version of I422ToARGB
2720// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2721__declspec(naked)
2722void J422ToARGBRow_SSSE3(const uint8* y_buf,
2723                         const uint8* u_buf,
2724                         const uint8* v_buf,
2725                         uint8* dst_argb,
2726                         int width) {
2727  __asm {
2728    push       esi
2729    push       edi
2730    mov        eax, [esp + 8 + 4]   // Y
2731    mov        esi, [esp + 8 + 8]   // U
2732    mov        edi, [esp + 8 + 12]  // V
2733    mov        edx, [esp + 8 + 16]  // argb
2734    mov        ecx, [esp + 8 + 20]  // width
2735    sub        edi, esi
2736    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2737
2738 convertloop:
2739    READYUV422
2740    YUVTORGB(kYuvJConstants)
2741    STOREARGB
2742
2743    sub        ecx, 8
2744    jg         convertloop
2745
2746    pop        edi
2747    pop        esi
2748    ret
2749  }
2750}
2751
2752// 8 pixels.
2753// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2754// Similar to I420 but duplicate UV once more.
2755__declspec(naked)
2756void I411ToARGBRow_SSSE3(const uint8* y_buf,
2757                         const uint8* u_buf,
2758                         const uint8* v_buf,
2759                         uint8* dst_argb,
2760                         int width) {
2761  __asm {
2762    push       ebx
2763    push       esi
2764    push       edi
2765    mov        eax, [esp + 12 + 4]   // Y
2766    mov        esi, [esp + 12 + 8]   // U
2767    mov        edi, [esp + 12 + 12]  // V
2768    mov        edx, [esp + 12 + 16]  // argb
2769    mov        ecx, [esp + 12 + 20]  // width
2770    sub        edi, esi
2771    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
2772
2773 convertloop:
2774    READYUV411  // modifies EBX
2775    YUVTORGB(kYuvConstants)
2776    STOREARGB
2777
2778    sub        ecx, 8
2779    jg         convertloop
2780
2781    pop        edi
2782    pop        esi
2783    pop        ebx
2784    ret
2785  }
2786}
2787
2788// 8 pixels.
2789// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2790__declspec(naked)
2791void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2792                         const uint8* uv_buf,
2793                         uint8* dst_argb,
2794                         int width) {
2795  __asm {
2796    push       esi
2797    mov        eax, [esp + 4 + 4]   // Y
2798    mov        esi, [esp + 4 + 8]   // UV
2799    mov        edx, [esp + 4 + 12]  // argb
2800    mov        ecx, [esp + 4 + 16]  // width
2801    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2802
2803 convertloop:
2804    READNV12
2805    YUVTORGB(kYuvConstants)
2806    STOREARGB
2807
2808    sub        ecx, 8
2809    jg         convertloop
2810
2811    pop        esi
2812    ret
2813  }
2814}
2815
2816// 8 pixels.
2817// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
2818__declspec(naked)
2819void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2820                         const uint8* uv_buf,
2821                         uint8* dst_argb,
2822                         int width) {
2823  __asm {
2824    push       esi
2825    mov        eax, [esp + 4 + 4]   // Y
2826    mov        esi, [esp + 4 + 8]   // UV
2827    mov        edx, [esp + 4 + 12]  // argb
2828    mov        ecx, [esp + 4 + 16]  // width
2829    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2830
2831 convertloop:
2832    READNV12
2833    YUVTORGB(kYvuConstants)
2834    STOREARGB
2835
2836    sub        ecx, 8
2837    jg         convertloop
2838
2839    pop        esi
2840    ret
2841  }
2842}
2843
2844__declspec(naked)
2845void I422ToBGRARow_SSSE3(const uint8* y_buf,
2846                         const uint8* u_buf,
2847                         const uint8* v_buf,
2848                         uint8* dst_bgra,
2849                         int width) {
2850  __asm {
2851    push       esi
2852    push       edi
2853    mov        eax, [esp + 8 + 4]   // Y
2854    mov        esi, [esp + 8 + 8]   // U
2855    mov        edi, [esp + 8 + 12]  // V
2856    mov        edx, [esp + 8 + 16]  // bgra
2857    mov        ecx, [esp + 8 + 20]  // width
2858    sub        edi, esi
2859
2860 convertloop:
2861    READYUV422
2862    YUVTORGB(kYuvConstants)
2863    STOREBGRA
2864
2865    sub        ecx, 8
2866    jg         convertloop
2867
2868    pop        edi
2869    pop        esi
2870    ret
2871  }
2872}
2873
2874__declspec(naked)
2875void I422ToABGRRow_SSSE3(const uint8* y_buf,
2876                         const uint8* u_buf,
2877                         const uint8* v_buf,
2878                         uint8* dst_abgr,
2879                         int width) {
2880  __asm {
2881    push       esi
2882    push       edi
2883    mov        eax, [esp + 8 + 4]   // Y
2884    mov        esi, [esp + 8 + 8]   // U
2885    mov        edi, [esp + 8 + 12]  // V
2886    mov        edx, [esp + 8 + 16]  // abgr
2887    mov        ecx, [esp + 8 + 20]  // width
2888    sub        edi, esi
2889    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2890
2891 convertloop:
2892    READYUV422
2893    YUVTORGB(kYuvConstants)
2894    STOREABGR
2895
2896    sub        ecx, 8
2897    jg         convertloop
2898
2899    pop        edi
2900    pop        esi
2901    ret
2902  }
2903}
2904
2905__declspec(naked)
2906void I422ToRGBARow_SSSE3(const uint8* y_buf,
2907                         const uint8* u_buf,
2908                         const uint8* v_buf,
2909                         uint8* dst_rgba,
2910                         int width) {
2911  __asm {
2912    push       esi
2913    push       edi
2914    mov        eax, [esp + 8 + 4]   // Y
2915    mov        esi, [esp + 8 + 8]   // U
2916    mov        edi, [esp + 8 + 12]  // V
2917    mov        edx, [esp + 8 + 16]  // rgba
2918    mov        ecx, [esp + 8 + 20]  // width
2919    sub        edi, esi
2920
2921 convertloop:
2922    READYUV422
2923    YUVTORGB(kYuvConstants)
2924    STORERGBA
2925
2926    sub        ecx, 8
2927    jg         convertloop
2928
2929    pop        edi
2930    pop        esi
2931    ret
2932  }
2933}
2934
2935#endif  // HAS_I422TOARGBROW_SSSE3
2936
2937#ifdef HAS_I400TOARGBROW_SSE2
2938// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2939__declspec(naked)
2940void I400ToARGBRow_SSE2(const uint8* y_buf,
2941                        uint8* rgb_buf,
2942                        int width) {
2943  __asm {
2944    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
2945    movd       xmm2, eax
2946    pshufd     xmm2, xmm2,0
2947    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
2948    movd       xmm3, eax
2949    pshufd     xmm3, xmm3, 0
2950    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
2951    pslld      xmm4, 24
2952
2953    mov        eax, [esp + 4]       // Y
2954    mov        edx, [esp + 8]       // rgb
2955    mov        ecx, [esp + 12]      // width
2956
2957 convertloop:
2958    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2959    movq       xmm0, qword ptr [eax]
2960    lea        eax, [eax + 8]
2961    punpcklbw  xmm0, xmm0           // Y.Y
2962    pmulhuw    xmm0, xmm2
2963    psubusw    xmm0, xmm3
2964    psrlw      xmm0, 6
2965    packuswb   xmm0, xmm0           // G
2966
2967    // Step 2: Weave into ARGB
2968    punpcklbw  xmm0, xmm0           // GG
2969    movdqa     xmm1, xmm0
2970    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
2971    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
2972    por        xmm0, xmm4
2973    por        xmm1, xmm4
2974    movdqu     [edx], xmm0
2975    movdqu     [edx + 16], xmm1
2976    lea        edx,  [edx + 32]
2977    sub        ecx, 8
2978    jg         convertloop
2979    ret
2980  }
2981}
2982#endif  // HAS_I400TOARGBROW_SSE2
2983
2984#ifdef HAS_I400TOARGBROW_AVX2
2985// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2986// note: vpunpcklbw mutates and vpackuswb unmutates.
2987__declspec(naked)
2988void I400ToARGBRow_AVX2(const uint8* y_buf,
2989                        uint8* rgb_buf,
2990                        int width) {
2991  __asm {
2992    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
2993    vmovd      xmm2, eax
2994    vbroadcastss ymm2, xmm2
2995    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
2996    vmovd      xmm3, eax
2997    vbroadcastss ymm3, xmm3
2998    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
2999    vpslld     ymm4, ymm4, 24
3000
3001    mov        eax, [esp + 4]       // Y
3002    mov        edx, [esp + 8]       // rgb
3003    mov        ecx, [esp + 12]      // width
3004
3005 convertloop:
3006    // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
3007    vmovdqu    xmm0, [eax]
3008    lea        eax, [eax + 16]
3009    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
3010    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
3011    vpmulhuw   ymm0, ymm0, ymm2
3012    vpsubusw   ymm0, ymm0, ymm3
3013    vpsrlw     ymm0, ymm0, 6
3014    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
3015
3016    // TODO(fbarchard): Weave alpha with unpack.
3017    // Step 2: Weave into ARGB
3018    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
3019    vpermq     ymm1, ymm1, 0xd8
3020    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
3021    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
3022    vpor       ymm0, ymm0, ymm4
3023    vpor       ymm1, ymm1, ymm4
3024    vmovdqu    [edx], ymm0
3025    vmovdqu    [edx + 32], ymm1
3026    lea        edx,  [edx + 64]
3027    sub        ecx, 16
3028    jg         convertloop
3029    vzeroupper
3030    ret
3031  }
3032}
3033#endif  // HAS_I400TOARGBROW_AVX2
3034
3035#ifdef HAS_MIRRORROW_SSSE3
3036// Shuffle table for reversing the bytes.
3037static const uvec8 kShuffleMirror = {
3038  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3039};
3040
3041// TODO(fbarchard): Replace lea with -16 offset.
3042__declspec(naked)
3043void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3044  __asm {
3045    mov       eax, [esp + 4]   // src
3046    mov       edx, [esp + 8]   // dst
3047    mov       ecx, [esp + 12]  // width
3048    movdqa    xmm5, kShuffleMirror
3049
3050 convertloop:
3051    movdqu    xmm0, [eax - 16 + ecx]
3052    pshufb    xmm0, xmm5
3053    movdqu    [edx], xmm0
3054    lea       edx, [edx + 16]
3055    sub       ecx, 16
3056    jg        convertloop
3057    ret
3058  }
3059}
3060#endif  // HAS_MIRRORROW_SSSE3
3061
3062#ifdef HAS_MIRRORROW_AVX2
3063__declspec(naked)
3064void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3065  __asm {
3066    mov       eax, [esp + 4]   // src
3067    mov       edx, [esp + 8]   // dst
3068    mov       ecx, [esp + 12]  // width
3069    vbroadcastf128 ymm5, kShuffleMirror
3070
3071 convertloop:
3072    vmovdqu   ymm0, [eax - 32 + ecx]
3073    vpshufb   ymm0, ymm0, ymm5
3074    vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
3075    vmovdqu   [edx], ymm0
3076    lea       edx, [edx + 32]
3077    sub       ecx, 32
3078    jg        convertloop
3079    vzeroupper
3080    ret
3081  }
3082}
3083#endif  // HAS_MIRRORROW_AVX2
3084
3085#ifdef HAS_MIRRORROW_SSE2
3086__declspec(naked)
3087void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3088  __asm {
3089    mov       eax, [esp + 4]   // src
3090    mov       edx, [esp + 8]   // dst
3091    mov       ecx, [esp + 12]  // width
3092
3093 convertloop:
3094    movdqu    xmm0, [eax - 16 + ecx]
3095    movdqa    xmm1, xmm0        // swap bytes
3096    psllw     xmm0, 8
3097    psrlw     xmm1, 8
3098    por       xmm0, xmm1
3099    pshuflw   xmm0, xmm0, 0x1b  // swap words
3100    pshufhw   xmm0, xmm0, 0x1b
3101    pshufd    xmm0, xmm0, 0x4e  // swap qwords
3102    movdqu    [edx], xmm0
3103    lea       edx, [edx + 16]
3104    sub       ecx, 16
3105    jg        convertloop
3106    ret
3107  }
3108}
3109#endif  // HAS_MIRRORROW_SSE2
3110
3111#ifdef HAS_MIRRORROW_UV_SSSE3
3112// Shuffle table for reversing the bytes of UV channels.
3113static const uvec8 kShuffleMirrorUV = {
3114  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3115};
3116
3117__declspec(naked)
3118void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3119                       int width) {
3120  __asm {
3121    push      edi
3122    mov       eax, [esp + 4 + 4]   // src
3123    mov       edx, [esp + 4 + 8]   // dst_u
3124    mov       edi, [esp + 4 + 12]  // dst_v
3125    mov       ecx, [esp + 4 + 16]  // width
3126    movdqa    xmm1, kShuffleMirrorUV
3127    lea       eax, [eax + ecx * 2 - 16]
3128    sub       edi, edx
3129
3130 convertloop:
3131    movdqu    xmm0, [eax]
3132    lea       eax, [eax - 16]
3133    pshufb    xmm0, xmm1
3134    movlpd    qword ptr [edx], xmm0
3135    movhpd    qword ptr [edx + edi], xmm0
3136    lea       edx, [edx + 8]
3137    sub       ecx, 8
3138    jg        convertloop
3139
3140    pop       edi
3141    ret
3142  }
3143}
3144#endif  // HAS_MIRRORROW_UV_SSSE3
3145
3146#ifdef HAS_ARGBMIRRORROW_SSE2
3147__declspec(naked)
3148void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3149  __asm {
3150    mov       eax, [esp + 4]   // src
3151    mov       edx, [esp + 8]   // dst
3152    mov       ecx, [esp + 12]  // width
3153    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3154
3155 convertloop:
3156    movdqu    xmm0, [eax]
3157    lea       eax, [eax - 16]
3158    pshufd    xmm0, xmm0, 0x1b
3159    movdqu    [edx], xmm0
3160    lea       edx, [edx + 16]
3161    sub       ecx, 4
3162    jg        convertloop
3163    ret
3164  }
3165}
3166#endif  // HAS_ARGBMIRRORROW_SSE2
3167
3168#ifdef HAS_ARGBMIRRORROW_AVX2
3169// Shuffle table for reversing the bytes.
3170static const ulvec32 kARGBShuffleMirror_AVX2 = {
3171  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3172};
3173
3174__declspec(naked)
3175void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3176  __asm {
3177    mov       eax, [esp + 4]   // src
3178    mov       edx, [esp + 8]   // dst
3179    mov       ecx, [esp + 12]  // width
3180    vmovdqu   ymm5, kARGBShuffleMirror_AVX2
3181
3182 convertloop:
3183    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3184    vmovdqu   [edx], ymm0
3185    lea       edx, [edx + 32]
3186    sub       ecx, 8
3187    jg        convertloop
3188    vzeroupper
3189    ret
3190  }
3191}
3192#endif  // HAS_ARGBMIRRORROW_AVX2
3193
3194#ifdef HAS_SPLITUVROW_SSE2
3195__declspec(naked)
3196void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3197  __asm {
3198    push       edi
3199    mov        eax, [esp + 4 + 4]    // src_uv
3200    mov        edx, [esp + 4 + 8]    // dst_u
3201    mov        edi, [esp + 4 + 12]   // dst_v
3202    mov        ecx, [esp + 4 + 16]   // pix
3203    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3204    psrlw      xmm5, 8
3205    sub        edi, edx
3206
3207  convertloop:
3208    movdqu     xmm0, [eax]
3209    movdqu     xmm1, [eax + 16]
3210    lea        eax,  [eax + 32]
3211    movdqa     xmm2, xmm0
3212    movdqa     xmm3, xmm1
3213    pand       xmm0, xmm5   // even bytes
3214    pand       xmm1, xmm5
3215    packuswb   xmm0, xmm1
3216    psrlw      xmm2, 8      // odd bytes
3217    psrlw      xmm3, 8
3218    packuswb   xmm2, xmm3
3219    movdqu     [edx], xmm0
3220    movdqu     [edx + edi], xmm2
3221    lea        edx, [edx + 16]
3222    sub        ecx, 16
3223    jg         convertloop
3224
3225    pop        edi
3226    ret
3227  }
3228}
3229
3230#endif  // HAS_SPLITUVROW_SSE2
3231
3232#ifdef HAS_SPLITUVROW_AVX2
3233__declspec(naked)
3234void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3235  __asm {
3236    push       edi
3237    mov        eax, [esp + 4 + 4]    // src_uv
3238    mov        edx, [esp + 4 + 8]    // dst_u
3239    mov        edi, [esp + 4 + 12]   // dst_v
3240    mov        ecx, [esp + 4 + 16]   // pix
3241    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3242    vpsrlw     ymm5, ymm5, 8
3243    sub        edi, edx
3244
3245  convertloop:
3246    vmovdqu    ymm0, [eax]
3247    vmovdqu    ymm1, [eax + 32]
3248    lea        eax,  [eax + 64]
3249    vpsrlw     ymm2, ymm0, 8      // odd bytes
3250    vpsrlw     ymm3, ymm1, 8
3251    vpand      ymm0, ymm0, ymm5   // even bytes
3252    vpand      ymm1, ymm1, ymm5
3253    vpackuswb  ymm0, ymm0, ymm1
3254    vpackuswb  ymm2, ymm2, ymm3
3255    vpermq     ymm0, ymm0, 0xd8
3256    vpermq     ymm2, ymm2, 0xd8
3257    vmovdqu    [edx], ymm0
3258    vmovdqu    [edx + edi], ymm2
3259    lea        edx, [edx + 32]
3260    sub        ecx, 32
3261    jg         convertloop
3262
3263    pop        edi
3264    vzeroupper
3265    ret
3266  }
3267}
3268#endif  // HAS_SPLITUVROW_AVX2
3269
3270#ifdef HAS_MERGEUVROW_SSE2
3271__declspec(naked)
3272void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3273                     int width) {
3274  __asm {
3275    push       edi
3276    mov        eax, [esp + 4 + 4]    // src_u
3277    mov        edx, [esp + 4 + 8]    // src_v
3278    mov        edi, [esp + 4 + 12]   // dst_uv
3279    mov        ecx, [esp + 4 + 16]   // width
3280    sub        edx, eax
3281
3282  convertloop:
3283    movdqu     xmm0, [eax]      // read 16 U's
3284    movdqu     xmm1, [eax + edx]  // and 16 V's
3285    lea        eax,  [eax + 16]
3286    movdqa     xmm2, xmm0
3287    punpcklbw  xmm0, xmm1       // first 8 UV pairs
3288    punpckhbw  xmm2, xmm1       // next 8 UV pairs
3289    movdqu     [edi], xmm0
3290    movdqu     [edi + 16], xmm2
3291    lea        edi, [edi + 32]
3292    sub        ecx, 16
3293    jg         convertloop
3294
3295    pop        edi
3296    ret
3297  }
3298}
3299#endif  //  HAS_MERGEUVROW_SSE2
3300
3301#ifdef HAS_MERGEUVROW_AVX2
3302__declspec(naked)
3303void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3304                     int width) {
3305  __asm {
3306    push       edi
3307    mov        eax, [esp + 4 + 4]    // src_u
3308    mov        edx, [esp + 4 + 8]    // src_v
3309    mov        edi, [esp + 4 + 12]   // dst_uv
3310    mov        ecx, [esp + 4 + 16]   // width
3311    sub        edx, eax
3312
3313  convertloop:
3314    vmovdqu    ymm0, [eax]           // read 32 U's
3315    vmovdqu    ymm1, [eax + edx]     // and 32 V's
3316    lea        eax,  [eax + 32]
3317    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
3318    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
3319    vextractf128 [edi], ymm2, 0       // bytes 0..15
3320    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
3321    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
3322    vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
3323    lea        edi, [edi + 64]
3324    sub        ecx, 32
3325    jg         convertloop
3326
3327    pop        edi
3328    vzeroupper
3329    ret
3330  }
3331}
3332#endif  //  HAS_MERGEUVROW_AVX2
3333
3334#ifdef HAS_COPYROW_SSE2
3335// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
3336__declspec(naked)
3337void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3338  __asm {
3339    mov        eax, [esp + 4]   // src
3340    mov        edx, [esp + 8]   // dst
3341    mov        ecx, [esp + 12]  // count
3342
3343  convertloop:
3344    movdqu     xmm0, [eax]
3345    movdqu     xmm1, [eax + 16]
3346    lea        eax, [eax + 32]
3347    movdqu     [edx], xmm0
3348    movdqu     [edx + 16], xmm1
3349    lea        edx, [edx + 32]
3350    sub        ecx, 32
3351    jg         convertloop
3352    ret
3353  }
3354}
3355#endif  // HAS_COPYROW_SSE2
3356
3357#ifdef HAS_COPYROW_AVX
3358// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
3359__declspec(naked)
3360void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3361  __asm {
3362    mov        eax, [esp + 4]   // src
3363    mov        edx, [esp + 8]   // dst
3364    mov        ecx, [esp + 12]  // count
3365
3366  convertloop:
3367    vmovdqu    ymm0, [eax]
3368    vmovdqu    ymm1, [eax + 32]
3369    lea        eax, [eax + 64]
3370    vmovdqu    [edx], ymm0
3371    vmovdqu    [edx + 32], ymm1
3372    lea        edx, [edx + 64]
3373    sub        ecx, 64
3374    jg         convertloop
3375
3376    vzeroupper
3377    ret
3378  }
3379}
3380#endif  // HAS_COPYROW_AVX
3381
3382// Multiple of 1.
3383__declspec(naked)
3384void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3385  __asm {
3386    mov        eax, esi
3387    mov        edx, edi
3388    mov        esi, [esp + 4]   // src
3389    mov        edi, [esp + 8]   // dst
3390    mov        ecx, [esp + 12]  // count
3391    rep movsb
3392    mov        edi, edx
3393    mov        esi, eax
3394    ret
3395  }
3396}
3397
3398#ifdef HAS_ARGBCOPYALPHAROW_SSE2
3399// width in pixels
3400__declspec(naked)
3401void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3402  __asm {
3403    mov        eax, [esp + 4]   // src
3404    mov        edx, [esp + 8]   // dst
3405    mov        ecx, [esp + 12]  // count
3406    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3407    pslld      xmm0, 24
3408    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3409    psrld      xmm1, 8
3410
3411  convertloop:
3412    movdqu     xmm2, [eax]
3413    movdqu     xmm3, [eax + 16]
3414    lea        eax, [eax + 32]
3415    movdqu     xmm4, [edx]
3416    movdqu     xmm5, [edx + 16]
3417    pand       xmm2, xmm0
3418    pand       xmm3, xmm0
3419    pand       xmm4, xmm1
3420    pand       xmm5, xmm1
3421    por        xmm2, xmm4
3422    por        xmm3, xmm5
3423    movdqu     [edx], xmm2
3424    movdqu     [edx + 16], xmm3
3425    lea        edx, [edx + 32]
3426    sub        ecx, 8
3427    jg         convertloop
3428
3429    ret
3430  }
3431}
3432#endif  // HAS_ARGBCOPYALPHAROW_SSE2
3433
3434#ifdef HAS_ARGBCOPYALPHAROW_AVX2
3435// width in pixels
3436__declspec(naked)
3437void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3438  __asm {
3439    mov        eax, [esp + 4]   // src
3440    mov        edx, [esp + 8]   // dst
3441    mov        ecx, [esp + 12]  // count
3442    vpcmpeqb   ymm0, ymm0, ymm0
3443    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3444
3445  convertloop:
3446    vmovdqu    ymm1, [eax]
3447    vmovdqu    ymm2, [eax + 32]
3448    lea        eax, [eax + 64]
3449    vpblendvb  ymm1, ymm1, [edx], ymm0
3450    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3451    vmovdqu    [edx], ymm1
3452    vmovdqu    [edx + 32], ymm2
3453    lea        edx, [edx + 64]
3454    sub        ecx, 16
3455    jg         convertloop
3456
3457    vzeroupper
3458    ret
3459  }
3460}
3461#endif  // HAS_ARGBCOPYALPHAROW_AVX2
3462
3463#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3464// width in pixels
3465__declspec(naked)
3466void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3467  __asm {
3468    mov        eax, [esp + 4]   // src
3469    mov        edx, [esp + 8]   // dst
3470    mov        ecx, [esp + 12]  // count
3471    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3472    pslld      xmm0, 24
3473    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3474    psrld      xmm1, 8
3475
3476  convertloop:
3477    movq       xmm2, qword ptr [eax]  // 8 Y's
3478    lea        eax, [eax + 8]
3479    punpcklbw  xmm2, xmm2
3480    punpckhwd  xmm3, xmm2
3481    punpcklwd  xmm2, xmm2
3482    movdqu     xmm4, [edx]
3483    movdqu     xmm5, [edx + 16]
3484    pand       xmm2, xmm0
3485    pand       xmm3, xmm0
3486    pand       xmm4, xmm1
3487    pand       xmm5, xmm1
3488    por        xmm2, xmm4
3489    por        xmm3, xmm5
3490    movdqu     [edx], xmm2
3491    movdqu     [edx + 16], xmm3
3492    lea        edx, [edx + 32]
3493    sub        ecx, 8
3494    jg         convertloop
3495
3496    ret
3497  }
3498}
3499#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3500
3501#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3502// width in pixels
3503__declspec(naked)
3504void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3505  __asm {
3506    mov        eax, [esp + 4]   // src
3507    mov        edx, [esp + 8]   // dst
3508    mov        ecx, [esp + 12]  // count
3509    vpcmpeqb   ymm0, ymm0, ymm0
3510    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3511
3512  convertloop:
3513    vpmovzxbd  ymm1, qword ptr [eax]
3514    vpmovzxbd  ymm2, qword ptr [eax + 8]
3515    lea        eax, [eax + 16]
3516    vpslld     ymm1, ymm1, 24
3517    vpslld     ymm2, ymm2, 24
3518    vpblendvb  ymm1, ymm1, [edx], ymm0
3519    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3520    vmovdqu    [edx], ymm1
3521    vmovdqu    [edx + 32], ymm2
3522    lea        edx, [edx + 64]
3523    sub        ecx, 16
3524    jg         convertloop
3525
3526    vzeroupper
3527    ret
3528  }
3529}
3530#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3531
3532#ifdef HAS_SETROW_X86
3533// Write 'count' bytes using an 8 bit value repeated.
3534// Count should be multiple of 4.
3535__declspec(naked)
3536void SetRow_X86(uint8* dst, uint8 v8, int count) {
3537  __asm {
3538    movzx      eax, byte ptr [esp + 8]    // v8
3539    mov        edx, 0x01010101  // Duplicate byte to all bytes.
3540    mul        edx              // overwrites edx with upper part of result.
3541    mov        edx, edi
3542    mov        edi, [esp + 4]   // dst
3543    mov        ecx, [esp + 12]  // count
3544    shr        ecx, 2
3545    rep stosd
3546    mov        edi, edx
3547    ret
3548  }
3549}
3550
3551// Write 'count' bytes using an 8 bit value repeated.
3552__declspec(naked)
3553void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3554  __asm {
3555    mov        edx, edi
3556    mov        edi, [esp + 4]   // dst
3557    mov        eax, [esp + 8]   // v8
3558    mov        ecx, [esp + 12]  // count
3559    rep stosb
3560    mov        edi, edx
3561    ret
3562  }
3563}
3564
3565// Write 'count' 32 bit values.
3566__declspec(naked)
3567void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3568  __asm {
3569    mov        edx, edi
3570    mov        edi, [esp + 4]   // dst
3571    mov        eax, [esp + 8]   // v32
3572    mov        ecx, [esp + 12]  // count
3573    rep stosd
3574    mov        edi, edx
3575    ret
3576  }
3577}
3578#endif  // HAS_SETROW_X86
3579
3580#ifdef HAS_YUY2TOYROW_AVX2
3581__declspec(naked)
3582void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3583                     uint8* dst_y, int pix) {
3584  __asm {
3585    mov        eax, [esp + 4]    // src_yuy2
3586    mov        edx, [esp + 8]    // dst_y
3587    mov        ecx, [esp + 12]   // pix
3588    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3589    vpsrlw     ymm5, ymm5, 8
3590
3591  convertloop:
3592    vmovdqu    ymm0, [eax]
3593    vmovdqu    ymm1, [eax + 32]
3594    lea        eax,  [eax + 64]
3595    vpand      ymm0, ymm0, ymm5   // even bytes are Y
3596    vpand      ymm1, ymm1, ymm5
3597    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3598    vpermq     ymm0, ymm0, 0xd8
3599    vmovdqu    [edx], ymm0
3600    lea        edx, [edx + 32]
3601    sub        ecx, 32
3602    jg         convertloop
3603    vzeroupper
3604    ret
3605  }
3606}
3607
3608__declspec(naked)
3609void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3610                      uint8* dst_u, uint8* dst_v, int pix) {
3611  __asm {
3612    push       esi
3613    push       edi
3614    mov        eax, [esp + 8 + 4]    // src_yuy2
3615    mov        esi, [esp + 8 + 8]    // stride_yuy2
3616    mov        edx, [esp + 8 + 12]   // dst_u
3617    mov        edi, [esp + 8 + 16]   // dst_v
3618    mov        ecx, [esp + 8 + 20]   // pix
3619    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3620    vpsrlw     ymm5, ymm5, 8
3621    sub        edi, edx
3622
3623  convertloop:
3624    vmovdqu    ymm0, [eax]
3625    vmovdqu    ymm1, [eax + 32]
3626    vpavgb     ymm0, ymm0, [eax + esi]
3627    vpavgb     ymm1, ymm1, [eax + esi + 32]
3628    lea        eax,  [eax + 64]
3629    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3630    vpsrlw     ymm1, ymm1, 8
3631    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3632    vpermq     ymm0, ymm0, 0xd8
3633    vpand      ymm1, ymm0, ymm5  // U
3634    vpsrlw     ymm0, ymm0, 8     // V
3635    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3636    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3637    vpermq     ymm1, ymm1, 0xd8
3638    vpermq     ymm0, ymm0, 0xd8
3639    vextractf128 [edx], ymm1, 0  // U
3640    vextractf128 [edx + edi], ymm0, 0 // V
3641    lea        edx, [edx + 16]
3642    sub        ecx, 32
3643    jg         convertloop
3644
3645    pop        edi
3646    pop        esi
3647    vzeroupper
3648    ret
3649  }
3650}
3651
3652__declspec(naked)
3653void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3654                         uint8* dst_u, uint8* dst_v, int pix) {
3655  __asm {
3656    push       edi
3657    mov        eax, [esp + 4 + 4]    // src_yuy2
3658    mov        edx, [esp + 4 + 8]    // dst_u
3659    mov        edi, [esp + 4 + 12]   // dst_v
3660    mov        ecx, [esp + 4 + 16]   // pix
3661    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3662    vpsrlw     ymm5, ymm5, 8
3663    sub        edi, edx
3664
3665  convertloop:
3666    vmovdqu    ymm0, [eax]
3667    vmovdqu    ymm1, [eax + 32]
3668    lea        eax,  [eax + 64]
3669    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3670    vpsrlw     ymm1, ymm1, 8
3671    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3672    vpermq     ymm0, ymm0, 0xd8
3673    vpand      ymm1, ymm0, ymm5  // U
3674    vpsrlw     ymm0, ymm0, 8     // V
3675    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3676    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3677    vpermq     ymm1, ymm1, 0xd8
3678    vpermq     ymm0, ymm0, 0xd8
3679    vextractf128 [edx], ymm1, 0  // U
3680    vextractf128 [edx + edi], ymm0, 0 // V
3681    lea        edx, [edx + 16]
3682    sub        ecx, 32
3683    jg         convertloop
3684
3685    pop        edi
3686    vzeroupper
3687    ret
3688  }
3689}
3690
3691__declspec(naked)
3692void UYVYToYRow_AVX2(const uint8* src_uyvy,
3693                     uint8* dst_y, int pix) {
3694  __asm {
3695    mov        eax, [esp + 4]    // src_uyvy
3696    mov        edx, [esp + 8]    // dst_y
3697    mov        ecx, [esp + 12]   // pix
3698
3699  convertloop:
3700    vmovdqu    ymm0, [eax]
3701    vmovdqu    ymm1, [eax + 32]
3702    lea        eax,  [eax + 64]
3703    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
3704    vpsrlw     ymm1, ymm1, 8
3705    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3706    vpermq     ymm0, ymm0, 0xd8
3707    vmovdqu    [edx], ymm0
3708    lea        edx, [edx + 32]
3709    sub        ecx, 32
3710    jg         convertloop
3711    vzeroupper
3712    ret
3713  }
3714}
3715
3716__declspec(naked)
3717void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3718                      uint8* dst_u, uint8* dst_v, int pix) {
3719  __asm {
3720    push       esi
3721    push       edi
3722    mov        eax, [esp + 8 + 4]    // src_yuy2
3723    mov        esi, [esp + 8 + 8]    // stride_yuy2
3724    mov        edx, [esp + 8 + 12]   // dst_u
3725    mov        edi, [esp + 8 + 16]   // dst_v
3726    mov        ecx, [esp + 8 + 20]   // pix
3727    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3728    vpsrlw     ymm5, ymm5, 8
3729    sub        edi, edx
3730
3731  convertloop:
3732    vmovdqu    ymm0, [eax]
3733    vmovdqu    ymm1, [eax + 32]
3734    vpavgb     ymm0, ymm0, [eax + esi]
3735    vpavgb     ymm1, ymm1, [eax + esi + 32]
3736    lea        eax,  [eax + 64]
3737    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
3738    vpand      ymm1, ymm1, ymm5
3739    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3740    vpermq     ymm0, ymm0, 0xd8
3741    vpand      ymm1, ymm0, ymm5  // U
3742    vpsrlw     ymm0, ymm0, 8     // V
3743    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3744    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3745    vpermq     ymm1, ymm1, 0xd8
3746    vpermq     ymm0, ymm0, 0xd8
3747    vextractf128 [edx], ymm1, 0  // U
3748    vextractf128 [edx + edi], ymm0, 0 // V
3749    lea        edx, [edx + 16]
3750    sub        ecx, 32
3751    jg         convertloop
3752
3753    pop        edi
3754    pop        esi
3755    vzeroupper
3756    ret
3757  }
3758}
3759
3760__declspec(naked)
3761void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3762                         uint8* dst_u, uint8* dst_v, int pix) {
3763  __asm {
3764    push       edi
3765    mov        eax, [esp + 4 + 4]    // src_yuy2
3766    mov        edx, [esp + 4 + 8]    // dst_u
3767    mov        edi, [esp + 4 + 12]   // dst_v
3768    mov        ecx, [esp + 4 + 16]   // pix
3769    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3770    vpsrlw     ymm5, ymm5, 8
3771    sub        edi, edx
3772
3773  convertloop:
3774    vmovdqu    ymm0, [eax]
3775    vmovdqu    ymm1, [eax + 32]
3776    lea        eax,  [eax + 64]
3777    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
3778    vpand      ymm1, ymm1, ymm5
3779    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3780    vpermq     ymm0, ymm0, 0xd8
3781    vpand      ymm1, ymm0, ymm5  // U
3782    vpsrlw     ymm0, ymm0, 8     // V
3783    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3784    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3785    vpermq     ymm1, ymm1, 0xd8
3786    vpermq     ymm0, ymm0, 0xd8
3787    vextractf128 [edx], ymm1, 0  // U
3788    vextractf128 [edx + edi], ymm0, 0 // V
3789    lea        edx, [edx + 16]
3790    sub        ecx, 32
3791    jg         convertloop
3792
3793    pop        edi
3794    vzeroupper
3795    ret
3796  }
3797}
3798#endif  // HAS_YUY2TOYROW_AVX2
3799
3800#ifdef HAS_YUY2TOYROW_SSE2
3801__declspec(naked)
3802void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3803                     uint8* dst_y, int pix) {
3804  __asm {
3805    mov        eax, [esp + 4]    // src_yuy2
3806    mov        edx, [esp + 8]    // dst_y
3807    mov        ecx, [esp + 12]   // pix
3808    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
3809    psrlw      xmm5, 8
3810
3811  convertloop:
3812    movdqu     xmm0, [eax]
3813    movdqu     xmm1, [eax + 16]
3814    lea        eax,  [eax + 32]
3815    pand       xmm0, xmm5   // even bytes are Y
3816    pand       xmm1, xmm5
3817    packuswb   xmm0, xmm1
3818    movdqu     [edx], xmm0
3819    lea        edx, [edx + 16]
3820    sub        ecx, 16
3821    jg         convertloop
3822    ret
3823  }
3824}
3825
3826__declspec(naked)
3827void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3828                      uint8* dst_u, uint8* dst_v, int pix) {
3829  __asm {
3830    push       esi
3831    push       edi
3832    mov        eax, [esp + 8 + 4]    // src_yuy2
3833    mov        esi, [esp + 8 + 8]    // stride_yuy2
3834    mov        edx, [esp + 8 + 12]   // dst_u
3835    mov        edi, [esp + 8 + 16]   // dst_v
3836    mov        ecx, [esp + 8 + 20]   // pix
3837    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3838    psrlw      xmm5, 8
3839    sub        edi, edx
3840
3841  convertloop:
3842    movdqu     xmm0, [eax]
3843    movdqu     xmm1, [eax + 16]
3844    movdqu     xmm2, [eax + esi]
3845    movdqu     xmm3, [eax + esi + 16]
3846    lea        eax,  [eax + 32]
3847    pavgb      xmm0, xmm2
3848    pavgb      xmm1, xmm3
3849    psrlw      xmm0, 8      // YUYV -> UVUV
3850    psrlw      xmm1, 8
3851    packuswb   xmm0, xmm1
3852    movdqa     xmm1, xmm0
3853    pand       xmm0, xmm5  // U
3854    packuswb   xmm0, xmm0
3855    psrlw      xmm1, 8     // V
3856    packuswb   xmm1, xmm1
3857    movq       qword ptr [edx], xmm0
3858    movq       qword ptr [edx + edi], xmm1
3859    lea        edx, [edx + 8]
3860    sub        ecx, 16
3861    jg         convertloop
3862
3863    pop        edi
3864    pop        esi
3865    ret
3866  }
3867}
3868
3869__declspec(naked)
3870void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3871                         uint8* dst_u, uint8* dst_v, int pix) {
3872  __asm {
3873    push       edi
3874    mov        eax, [esp + 4 + 4]    // src_yuy2
3875    mov        edx, [esp + 4 + 8]    // dst_u
3876    mov        edi, [esp + 4 + 12]   // dst_v
3877    mov        ecx, [esp + 4 + 16]   // pix
3878    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3879    psrlw      xmm5, 8
3880    sub        edi, edx
3881
3882  convertloop:
3883    movdqu     xmm0, [eax]
3884    movdqu     xmm1, [eax + 16]
3885    lea        eax,  [eax + 32]
3886    psrlw      xmm0, 8      // YUYV -> UVUV
3887    psrlw      xmm1, 8
3888    packuswb   xmm0, xmm1
3889    movdqa     xmm1, xmm0
3890    pand       xmm0, xmm5  // U
3891    packuswb   xmm0, xmm0
3892    psrlw      xmm1, 8     // V
3893    packuswb   xmm1, xmm1
3894    movq       qword ptr [edx], xmm0
3895    movq       qword ptr [edx + edi], xmm1
3896    lea        edx, [edx + 8]
3897    sub        ecx, 16
3898    jg         convertloop
3899
3900    pop        edi
3901    ret
3902  }
3903}
3904
3905__declspec(naked)
3906void UYVYToYRow_SSE2(const uint8* src_uyvy,
3907                     uint8* dst_y, int pix) {
3908  __asm {
3909    mov        eax, [esp + 4]    // src_uyvy
3910    mov        edx, [esp + 8]    // dst_y
3911    mov        ecx, [esp + 12]   // pix
3912
3913  convertloop:
3914    movdqu     xmm0, [eax]
3915    movdqu     xmm1, [eax + 16]
3916    lea        eax,  [eax + 32]
3917    psrlw      xmm0, 8    // odd bytes are Y
3918    psrlw      xmm1, 8
3919    packuswb   xmm0, xmm1
3920    movdqu     [edx], xmm0
3921    lea        edx, [edx + 16]
3922    sub        ecx, 16
3923    jg         convertloop
3924    ret
3925  }
3926}
3927
3928__declspec(naked)
3929void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3930                      uint8* dst_u, uint8* dst_v, int pix) {
3931  __asm {
3932    push       esi
3933    push       edi
3934    mov        eax, [esp + 8 + 4]    // src_yuy2
3935    mov        esi, [esp + 8 + 8]    // stride_yuy2
3936    mov        edx, [esp + 8 + 12]   // dst_u
3937    mov        edi, [esp + 8 + 16]   // dst_v
3938    mov        ecx, [esp + 8 + 20]   // pix
3939    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3940    psrlw      xmm5, 8
3941    sub        edi, edx
3942
3943  convertloop:
3944    movdqu     xmm0, [eax]
3945    movdqu     xmm1, [eax + 16]
3946    movdqu     xmm2, [eax + esi]
3947    movdqu     xmm3, [eax + esi + 16]
3948    lea        eax,  [eax + 32]
3949    pavgb      xmm0, xmm2
3950    pavgb      xmm1, xmm3
3951    pand       xmm0, xmm5   // UYVY -> UVUV
3952    pand       xmm1, xmm5
3953    packuswb   xmm0, xmm1
3954    movdqa     xmm1, xmm0
3955    pand       xmm0, xmm5  // U
3956    packuswb   xmm0, xmm0
3957    psrlw      xmm1, 8     // V
3958    packuswb   xmm1, xmm1
3959    movq       qword ptr [edx], xmm0
3960    movq       qword ptr [edx + edi], xmm1
3961    lea        edx, [edx + 8]
3962    sub        ecx, 16
3963    jg         convertloop
3964
3965    pop        edi
3966    pop        esi
3967    ret
3968  }
3969}
3970
3971__declspec(naked)
3972void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3973                         uint8* dst_u, uint8* dst_v, int pix) {
3974  __asm {
3975    push       edi
3976    mov        eax, [esp + 4 + 4]    // src_yuy2
3977    mov        edx, [esp + 4 + 8]    // dst_u
3978    mov        edi, [esp + 4 + 12]   // dst_v
3979    mov        ecx, [esp + 4 + 16]   // pix
3980    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3981    psrlw      xmm5, 8
3982    sub        edi, edx
3983
3984  convertloop:
3985    movdqu     xmm0, [eax]
3986    movdqu     xmm1, [eax + 16]
3987    lea        eax,  [eax + 32]
3988    pand       xmm0, xmm5   // UYVY -> UVUV
3989    pand       xmm1, xmm5
3990    packuswb   xmm0, xmm1
3991    movdqa     xmm1, xmm0
3992    pand       xmm0, xmm5  // U
3993    packuswb   xmm0, xmm0
3994    psrlw      xmm1, 8     // V
3995    packuswb   xmm1, xmm1
3996    movq       qword ptr [edx], xmm0
3997    movq       qword ptr [edx + edi], xmm1
3998    lea        edx, [edx + 8]
3999    sub        ecx, 16
4000    jg         convertloop
4001
4002    pop        edi
4003    ret
4004  }
4005}
4006#endif  // HAS_YUY2TOYROW_SSE2
4007
4008#ifdef HAS_ARGBBLENDROW_SSE2
4009// Blend 8 pixels at a time.
4010__declspec(naked)
4011void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4012                       uint8* dst_argb, int width) {
4013  __asm {
4014    push       esi
4015    mov        eax, [esp + 4 + 4]   // src_argb0
4016    mov        esi, [esp + 4 + 8]   // src_argb1
4017    mov        edx, [esp + 4 + 12]  // dst_argb
4018    mov        ecx, [esp + 4 + 16]  // width
4019    pcmpeqb    xmm7, xmm7       // generate constant 1
4020    psrlw      xmm7, 15
4021    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4022    psrlw      xmm6, 8
4023    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4024    psllw      xmm5, 8
4025    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4026    pslld      xmm4, 24
4027    sub        ecx, 4
4028    jl         convertloop4b    // less than 4 pixels?
4029
4030    // 4 pixel loop.
4031  convertloop4:
4032    movdqu     xmm3, [eax]      // src argb
4033    lea        eax, [eax + 16]
4034    movdqa     xmm0, xmm3       // src argb
4035    pxor       xmm3, xmm4       // ~alpha
4036    movdqu     xmm2, [esi]      // _r_b
4037    psrlw      xmm3, 8          // alpha
4038    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
4039    pshuflw    xmm3, xmm3, 0F5h
4040    pand       xmm2, xmm6       // _r_b
4041    paddw      xmm3, xmm7       // 256 - alpha
4042    pmullw     xmm2, xmm3       // _r_b * alpha
4043    movdqu     xmm1, [esi]      // _a_g
4044    lea        esi, [esi + 16]
4045    psrlw      xmm1, 8          // _a_g
4046    por        xmm0, xmm4       // set alpha to 255
4047    pmullw     xmm1, xmm3       // _a_g * alpha
4048    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4049    paddusb    xmm0, xmm2       // + src argb
4050    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4051    paddusb    xmm0, xmm1       // + src argb
4052    movdqu     [edx], xmm0
4053    lea        edx, [edx + 16]
4054    sub        ecx, 4
4055    jge        convertloop4
4056
4057  convertloop4b:
4058    add        ecx, 4 - 1
4059    jl         convertloop1b
4060
4061    // 1 pixel loop.
4062  convertloop1:
4063    movd       xmm3, [eax]      // src argb
4064    lea        eax, [eax + 4]
4065    movdqa     xmm0, xmm3       // src argb
4066    pxor       xmm3, xmm4       // ~alpha
4067    movd       xmm2, [esi]      // _r_b
4068    psrlw      xmm3, 8          // alpha
4069    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
4070    pshuflw    xmm3, xmm3, 0F5h
4071    pand       xmm2, xmm6       // _r_b
4072    paddw      xmm3, xmm7       // 256 - alpha
4073    pmullw     xmm2, xmm3       // _r_b * alpha
4074    movd       xmm1, [esi]      // _a_g
4075    lea        esi, [esi + 4]
4076    psrlw      xmm1, 8          // _a_g
4077    por        xmm0, xmm4       // set alpha to 255
4078    pmullw     xmm1, xmm3       // _a_g * alpha
4079    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4080    paddusb    xmm0, xmm2       // + src argb
4081    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4082    paddusb    xmm0, xmm1       // + src argb
4083    movd       [edx], xmm0
4084    lea        edx, [edx + 4]
4085    sub        ecx, 1
4086    jge        convertloop1
4087
4088  convertloop1b:
4089    pop        esi
4090    ret
4091  }
4092}
4093#endif  // HAS_ARGBBLENDROW_SSE2
4094
4095#ifdef HAS_ARGBBLENDROW_SSSE3
4096// Shuffle table for isolating alpha.
4097static const uvec8 kShuffleAlpha = {
4098  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4099  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4100};
4101// Same as SSE2, but replaces:
4102//    psrlw      xmm3, 8          // alpha
4103//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
4104//    pshuflw    xmm3, xmm3, 0F5h
4105// with..
4106//    pshufb     xmm3, kShuffleAlpha // alpha
4107// Blend 8 pixels at a time.
4108
4109__declspec(naked)
4110void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4111                        uint8* dst_argb, int width) {
4112  __asm {
4113    push       esi
4114    mov        eax, [esp + 4 + 4]   // src_argb0
4115    mov        esi, [esp + 4 + 8]   // src_argb1
4116    mov        edx, [esp + 4 + 12]  // dst_argb
4117    mov        ecx, [esp + 4 + 16]  // width
4118    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
4119    psrlw      xmm7, 15
4120    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4121    psrlw      xmm6, 8
4122    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4123    psllw      xmm5, 8
4124    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4125    pslld      xmm4, 24
4126    sub        ecx, 4
4127    jl         convertloop4b    // less than 4 pixels?
4128
4129    // 4 pixel loop.
4130  convertloop4:
4131    movdqu     xmm3, [eax]      // src argb
4132    lea        eax, [eax + 16]
4133    movdqa     xmm0, xmm3       // src argb
4134    pxor       xmm3, xmm4       // ~alpha
4135    movdqu     xmm2, [esi]      // _r_b
4136    pshufb     xmm3, kShuffleAlpha // alpha
4137    pand       xmm2, xmm6       // _r_b
4138    paddw      xmm3, xmm7       // 256 - alpha
4139    pmullw     xmm2, xmm3       // _r_b * alpha
4140    movdqu     xmm1, [esi]      // _a_g
4141    lea        esi, [esi + 16]
4142    psrlw      xmm1, 8          // _a_g
4143    por        xmm0, xmm4       // set alpha to 255
4144    pmullw     xmm1, xmm3       // _a_g * alpha
4145    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4146    paddusb    xmm0, xmm2       // + src argb
4147    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4148    paddusb    xmm0, xmm1       // + src argb
4149    movdqu     [edx], xmm0
4150    lea        edx, [edx + 16]
4151    sub        ecx, 4
4152    jge        convertloop4
4153
4154  convertloop4b:
4155    add        ecx, 4 - 1
4156    jl         convertloop1b
4157
4158    // 1 pixel loop.
4159  convertloop1:
4160    movd       xmm3, [eax]      // src argb
4161    lea        eax, [eax + 4]
4162    movdqa     xmm0, xmm3       // src argb
4163    pxor       xmm3, xmm4       // ~alpha
4164    movd       xmm2, [esi]      // _r_b
4165    pshufb     xmm3, kShuffleAlpha // alpha
4166    pand       xmm2, xmm6       // _r_b
4167    paddw      xmm3, xmm7       // 256 - alpha
4168    pmullw     xmm2, xmm3       // _r_b * alpha
4169    movd       xmm1, [esi]      // _a_g
4170    lea        esi, [esi + 4]
4171    psrlw      xmm1, 8          // _a_g
4172    por        xmm0, xmm4       // set alpha to 255
4173    pmullw     xmm1, xmm3       // _a_g * alpha
4174    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4175    paddusb    xmm0, xmm2       // + src argb
4176    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4177    paddusb    xmm0, xmm1       // + src argb
4178    movd       [edx], xmm0
4179    lea        edx, [edx + 4]
4180    sub        ecx, 1
4181    jge        convertloop1
4182
4183  convertloop1b:
4184    pop        esi
4185    ret
4186  }
4187}
4188#endif  // HAS_ARGBBLENDROW_SSSE3
4189
4190#ifdef HAS_ARGBATTENUATEROW_SSE2
4191// Attenuate 4 pixels at a time.
4192__declspec(naked)
4193void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4194  __asm {
4195    mov        eax, [esp + 4]   // src_argb0
4196    mov        edx, [esp + 8]   // dst_argb
4197    mov        ecx, [esp + 12]  // width
4198    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4199    pslld      xmm4, 24
4200    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
4201    psrld      xmm5, 8
4202
4203 convertloop:
4204    movdqu     xmm0, [eax]      // read 4 pixels
4205    punpcklbw  xmm0, xmm0       // first 2
4206    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
4207    pshuflw    xmm2, xmm2, 0FFh
4208    pmulhuw    xmm0, xmm2       // rgb * a
4209    movdqu     xmm1, [eax]      // read 4 pixels
4210    punpckhbw  xmm1, xmm1       // next 2 pixels
4211    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
4212    pshuflw    xmm2, xmm2, 0FFh
4213    pmulhuw    xmm1, xmm2       // rgb * a
4214    movdqu     xmm2, [eax]      // alphas
4215    lea        eax, [eax + 16]
4216    psrlw      xmm0, 8
4217    pand       xmm2, xmm4
4218    psrlw      xmm1, 8
4219    packuswb   xmm0, xmm1
4220    pand       xmm0, xmm5       // keep original alphas
4221    por        xmm0, xmm2
4222    movdqu     [edx], xmm0
4223    lea        edx, [edx + 16]
4224    sub        ecx, 4
4225    jg         convertloop
4226
4227    ret
4228  }
4229}
4230#endif  // HAS_ARGBATTENUATEROW_SSE2
4231
4232#ifdef HAS_ARGBATTENUATEROW_SSSE3
4233// Shuffle table duplicating alpha.
4234static const uvec8 kShuffleAlpha0 = {
4235  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4236};
4237static const uvec8 kShuffleAlpha1 = {
4238  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4239  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4240};
4241__declspec(naked)
4242void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4243  __asm {
4244    mov        eax, [esp + 4]   // src_argb0
4245    mov        edx, [esp + 8]   // dst_argb
4246    mov        ecx, [esp + 12]  // width
4247    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
4248    pslld      xmm3, 24
4249    movdqa     xmm4, kShuffleAlpha0
4250    movdqa     xmm5, kShuffleAlpha1
4251
4252 convertloop:
4253    movdqu     xmm0, [eax]      // read 4 pixels
4254    pshufb     xmm0, xmm4       // isolate first 2 alphas
4255    movdqu     xmm1, [eax]      // read 4 pixels
4256    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
4257    pmulhuw    xmm0, xmm1       // rgb * a
4258    movdqu     xmm1, [eax]      // read 4 pixels
4259    pshufb     xmm1, xmm5       // isolate next 2 alphas
4260    movdqu     xmm2, [eax]      // read 4 pixels
4261    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
4262    pmulhuw    xmm1, xmm2       // rgb * a
4263    movdqu     xmm2, [eax]      // mask original alpha
4264    lea        eax, [eax + 16]
4265    pand       xmm2, xmm3
4266    psrlw      xmm0, 8
4267    psrlw      xmm1, 8
4268    packuswb   xmm0, xmm1
4269    por        xmm0, xmm2       // copy original alpha
4270    movdqu     [edx], xmm0
4271    lea        edx, [edx + 16]
4272    sub        ecx, 4
4273    jg         convertloop
4274
4275    ret
4276  }
4277}
4278#endif  // HAS_ARGBATTENUATEROW_SSSE3
4279
4280#ifdef HAS_ARGBATTENUATEROW_AVX2
4281// Shuffle table duplicating alpha.
4282static const uvec8 kShuffleAlpha_AVX2 = {
4283  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
4284};
4285__declspec(naked)
4286void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4287  __asm {
4288    mov        eax, [esp + 4]   // src_argb0
4289    mov        edx, [esp + 8]   // dst_argb
4290    mov        ecx, [esp + 12]  // width
4291    sub        edx, eax
4292    vbroadcastf128 ymm4,kShuffleAlpha_AVX2
4293    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
4294    vpslld     ymm5, ymm5, 24
4295
4296 convertloop:
4297    vmovdqu    ymm6, [eax]       // read 8 pixels.
4298    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4299    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4300    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
4301    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
4302    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
4303    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
4304    vpand      ymm6, ymm6, ymm5  // isolate alpha
4305    vpsrlw     ymm0, ymm0, 8
4306    vpsrlw     ymm1, ymm1, 8
4307    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4308    vpor       ymm0, ymm0, ymm6  // copy original alpha
4309    vmovdqu    [eax + edx], ymm0
4310    lea        eax, [eax + 32]
4311    sub        ecx, 8
4312    jg         convertloop
4313
4314    vzeroupper
4315    ret
4316  }
4317}
4318#endif  // HAS_ARGBATTENUATEROW_AVX2
4319
4320#ifdef HAS_ARGBUNATTENUATEROW_SSE2
4321// Unattenuate 4 pixels at a time.
4322__declspec(naked)
4323void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4324                             int width) {
4325  __asm {
4326    push       esi
4327    push       edi
4328    mov        eax, [esp + 8 + 4]   // src_argb0
4329    mov        edx, [esp + 8 + 8]   // dst_argb
4330    mov        ecx, [esp + 8 + 12]  // width
4331
4332 convertloop:
4333    movdqu     xmm0, [eax]      // read 4 pixels
4334    movzx      esi, byte ptr [eax + 3]  // first alpha
4335    movzx      edi, byte ptr [eax + 7]  // second alpha
4336    punpcklbw  xmm0, xmm0       // first 2
4337    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
4338    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
4339    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
4340    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4341    movlhps    xmm2, xmm3
4342    pmulhuw    xmm0, xmm2       // rgb * a
4343
4344    movdqu     xmm1, [eax]      // read 4 pixels
4345    movzx      esi, byte ptr [eax + 11]  // third alpha
4346    movzx      edi, byte ptr [eax + 15]  // forth alpha
4347    punpckhbw  xmm1, xmm1       // next 2
4348    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
4349    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
4350    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
4351    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4352    movlhps    xmm2, xmm3
4353    pmulhuw    xmm1, xmm2       // rgb * a
4354    lea        eax, [eax + 16]
4355
4356    packuswb   xmm0, xmm1
4357    movdqu     [edx], xmm0
4358    lea        edx, [edx + 16]
4359    sub        ecx, 4
4360    jg         convertloop
4361    pop        edi
4362    pop        esi
4363    ret
4364  }
4365}
4366#endif  // HAS_ARGBUNATTENUATEROW_SSE2
4367
4368#ifdef HAS_ARGBUNATTENUATEROW_AVX2
4369// Shuffle table duplicating alpha.
4370static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4371  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
4372};
4373// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4374// USE_GATHER is not on by default, due to being a slow instruction.
4375#ifdef USE_GATHER
4376__declspec(naked)
4377void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4378                             int width) {
4379  __asm {
4380    mov        eax, [esp + 4]   // src_argb0
4381    mov        edx, [esp + 8]   // dst_argb
4382    mov        ecx, [esp + 12]  // width
4383    sub        edx, eax
4384    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
4385
4386 convertloop:
4387    vmovdqu    ymm6, [eax]       // read 8 pixels.
4388    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
4389    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
4390    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4391    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4392    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
4393    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4394    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4395    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
4396    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
4397    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4398    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4399    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4400    vmovdqu    [eax + edx], ymm0
4401    lea        eax, [eax + 32]
4402    sub        ecx, 8
4403    jg         convertloop
4404
4405    vzeroupper
4406    ret
4407  }
4408}
4409#else  // USE_GATHER
4410__declspec(naked)
4411void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4412                             int width) {
4413  __asm {
4414
4415    mov        eax, [esp + 4]   // src_argb0
4416    mov        edx, [esp + 8]   // dst_argb
4417    mov        ecx, [esp + 12]  // width
4418    sub        edx, eax
4419    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
4420
4421    push       esi
4422    push       edi
4423
4424 convertloop:
4425    // replace VPGATHER
4426    movzx      esi, byte ptr [eax + 3]                 // alpha0
4427    movzx      edi, byte ptr [eax + 7]                 // alpha1
4428    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
4429    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
4430    movzx      esi, byte ptr [eax + 11]                // alpha2
4431    movzx      edi, byte ptr [eax + 15]                // alpha3
4432    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
4433    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
4434    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
4435    movzx      esi, byte ptr [eax + 19]                // alpha4
4436    movzx      edi, byte ptr [eax + 23]                // alpha5
4437    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
4438    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
4439    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
4440    movzx      esi, byte ptr [eax + 27]                // alpha6
4441    movzx      edi, byte ptr [eax + 31]                // alpha7
4442    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
4443    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
4444    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
4445    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
4446    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
4447    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
4448    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4449    // end of VPGATHER
4450
4451    vmovdqu    ymm6, [eax]       // read 8 pixels.
4452    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4453    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4454    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4455    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4456    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
4457    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
4458    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4459    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4460    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4461    vmovdqu    [eax + edx], ymm0
4462    lea        eax, [eax + 32]
4463    sub        ecx, 8
4464    jg         convertloop
4465
4466    pop        edi
4467    pop        esi
4468    vzeroupper
4469    ret
4470  }
4471}
4472#endif  // USE_GATHER
4473#endif  // HAS_ARGBATTENUATEROW_AVX2
4474
4475#ifdef HAS_ARGBGRAYROW_SSSE3
4476// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4477__declspec(naked)
4478void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4479  __asm {
4480    mov        eax, [esp + 4]   /* src_argb */
4481    mov        edx, [esp + 8]   /* dst_argb */
4482    mov        ecx, [esp + 12]  /* width */
4483    movdqa     xmm4, kARGBToYJ
4484    movdqa     xmm5, kAddYJ64
4485
4486 convertloop:
4487    movdqu     xmm0, [eax]  // G
4488    movdqu     xmm1, [eax + 16]
4489    pmaddubsw  xmm0, xmm4
4490    pmaddubsw  xmm1, xmm4
4491    phaddw     xmm0, xmm1
4492    paddw      xmm0, xmm5  // Add .5 for rounding.
4493    psrlw      xmm0, 7
4494    packuswb   xmm0, xmm0   // 8 G bytes
4495    movdqu     xmm2, [eax]  // A
4496    movdqu     xmm3, [eax + 16]
4497    lea        eax, [eax + 32]
4498    psrld      xmm2, 24
4499    psrld      xmm3, 24
4500    packuswb   xmm2, xmm3
4501    packuswb   xmm2, xmm2   // 8 A bytes
4502    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
4503    punpcklbw  xmm0, xmm0   // 8 GG words
4504    punpcklbw  xmm3, xmm2   // 8 GA words
4505    movdqa     xmm1, xmm0
4506    punpcklwd  xmm0, xmm3   // GGGA first 4
4507    punpckhwd  xmm1, xmm3   // GGGA next 4
4508    movdqu     [edx], xmm0
4509    movdqu     [edx + 16], xmm1
4510    lea        edx, [edx + 32]
4511    sub        ecx, 8
4512    jg         convertloop
4513    ret
4514  }
4515}
4516#endif  // HAS_ARGBGRAYROW_SSSE3
4517
4518#ifdef HAS_ARGBSEPIAROW_SSSE3
4519//    b = (r * 35 + g * 68 + b * 17) >> 7
4520//    g = (r * 45 + g * 88 + b * 22) >> 7
4521//    r = (r * 50 + g * 98 + b * 24) >> 7
4522// Constant for ARGB color to sepia tone.
4523static const vec8 kARGBToSepiaB = {
4524  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
4525};
4526
4527static const vec8 kARGBToSepiaG = {
4528  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4529};
4530
4531static const vec8 kARGBToSepiaR = {
4532  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4533};
4534
4535// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4536__declspec(naked)
4537void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4538  __asm {
4539    mov        eax, [esp + 4]   /* dst_argb */
4540    mov        ecx, [esp + 8]   /* width */
4541    movdqa     xmm2, kARGBToSepiaB
4542    movdqa     xmm3, kARGBToSepiaG
4543    movdqa     xmm4, kARGBToSepiaR
4544
4545 convertloop:
4546    movdqu     xmm0, [eax]  // B
4547    movdqu     xmm6, [eax + 16]
4548    pmaddubsw  xmm0, xmm2
4549    pmaddubsw  xmm6, xmm2
4550    phaddw     xmm0, xmm6
4551    psrlw      xmm0, 7
4552    packuswb   xmm0, xmm0   // 8 B values
4553    movdqu     xmm5, [eax]  // G
4554    movdqu     xmm1, [eax + 16]
4555    pmaddubsw  xmm5, xmm3
4556    pmaddubsw  xmm1, xmm3
4557    phaddw     xmm5, xmm1
4558    psrlw      xmm5, 7
4559    packuswb   xmm5, xmm5   // 8 G values
4560    punpcklbw  xmm0, xmm5   // 8 BG values
4561    movdqu     xmm5, [eax]  // R
4562    movdqu     xmm1, [eax + 16]
4563    pmaddubsw  xmm5, xmm4
4564    pmaddubsw  xmm1, xmm4
4565    phaddw     xmm5, xmm1
4566    psrlw      xmm5, 7
4567    packuswb   xmm5, xmm5   // 8 R values
4568    movdqu     xmm6, [eax]  // A
4569    movdqu     xmm1, [eax + 16]
4570    psrld      xmm6, 24
4571    psrld      xmm1, 24
4572    packuswb   xmm6, xmm1
4573    packuswb   xmm6, xmm6   // 8 A values
4574    punpcklbw  xmm5, xmm6   // 8 RA values
4575    movdqa     xmm1, xmm0   // Weave BG, RA together
4576    punpcklwd  xmm0, xmm5   // BGRA first 4
4577    punpckhwd  xmm1, xmm5   // BGRA next 4
4578    movdqu     [eax], xmm0
4579    movdqu     [eax + 16], xmm1
4580    lea        eax, [eax + 32]
4581    sub        ecx, 8
4582    jg         convertloop
4583    ret
4584  }
4585}
4586#endif  // HAS_ARGBSEPIAROW_SSSE3
4587
4588#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4589// Tranform 8 ARGB pixels (32 bytes) with color matrix.
4590// Same as Sepia except matrix is provided.
4591// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4592// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
4593__declspec(naked)
4594void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4595                              const int8* matrix_argb, int width) {
4596  __asm {
4597    mov        eax, [esp + 4]   /* src_argb */
4598    mov        edx, [esp + 8]   /* dst_argb */
4599    mov        ecx, [esp + 12]  /* matrix_argb */
4600    movdqu     xmm5, [ecx]
4601    pshufd     xmm2, xmm5, 0x00
4602    pshufd     xmm3, xmm5, 0x55
4603    pshufd     xmm4, xmm5, 0xaa
4604    pshufd     xmm5, xmm5, 0xff
4605    mov        ecx, [esp + 16]  /* width */
4606
4607 convertloop:
4608    movdqu     xmm0, [eax]  // B
4609    movdqu     xmm7, [eax + 16]
4610    pmaddubsw  xmm0, xmm2
4611    pmaddubsw  xmm7, xmm2
4612    movdqu     xmm6, [eax]  // G
4613    movdqu     xmm1, [eax + 16]
4614    pmaddubsw  xmm6, xmm3
4615    pmaddubsw  xmm1, xmm3
4616    phaddsw    xmm0, xmm7   // B
4617    phaddsw    xmm6, xmm1   // G
4618    psraw      xmm0, 6      // B
4619    psraw      xmm6, 6      // G
4620    packuswb   xmm0, xmm0   // 8 B values
4621    packuswb   xmm6, xmm6   // 8 G values
4622    punpcklbw  xmm0, xmm6   // 8 BG values
4623    movdqu     xmm1, [eax]  // R
4624    movdqu     xmm7, [eax + 16]
4625    pmaddubsw  xmm1, xmm4
4626    pmaddubsw  xmm7, xmm4
4627    phaddsw    xmm1, xmm7   // R
4628    movdqu     xmm6, [eax]  // A
4629    movdqu     xmm7, [eax + 16]
4630    pmaddubsw  xmm6, xmm5
4631    pmaddubsw  xmm7, xmm5
4632    phaddsw    xmm6, xmm7   // A
4633    psraw      xmm1, 6      // R
4634    psraw      xmm6, 6      // A
4635    packuswb   xmm1, xmm1   // 8 R values
4636    packuswb   xmm6, xmm6   // 8 A values
4637    punpcklbw  xmm1, xmm6   // 8 RA values
4638    movdqa     xmm6, xmm0   // Weave BG, RA together
4639    punpcklwd  xmm0, xmm1   // BGRA first 4
4640    punpckhwd  xmm6, xmm1   // BGRA next 4
4641    movdqu     [edx], xmm0
4642    movdqu     [edx + 16], xmm6
4643    lea        eax, [eax + 32]
4644    lea        edx, [edx + 32]
4645    sub        ecx, 8
4646    jg         convertloop
4647    ret
4648  }
4649}
4650#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4651
4652#ifdef HAS_ARGBQUANTIZEROW_SSE2
4653// Quantize 4 ARGB pixels (16 bytes).
4654__declspec(naked)
4655void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4656                          int interval_offset, int width) {
4657  __asm {
4658    mov        eax, [esp + 4]    /* dst_argb */
4659    movd       xmm2, [esp + 8]   /* scale */
4660    movd       xmm3, [esp + 12]  /* interval_size */
4661    movd       xmm4, [esp + 16]  /* interval_offset */
4662    mov        ecx, [esp + 20]   /* width */
4663    pshuflw    xmm2, xmm2, 040h
4664    pshufd     xmm2, xmm2, 044h
4665    pshuflw    xmm3, xmm3, 040h
4666    pshufd     xmm3, xmm3, 044h
4667    pshuflw    xmm4, xmm4, 040h
4668    pshufd     xmm4, xmm4, 044h
4669    pxor       xmm5, xmm5  // constant 0
4670    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
4671    pslld      xmm6, 24
4672
4673 convertloop:
4674    movdqu     xmm0, [eax]  // read 4 pixels
4675    punpcklbw  xmm0, xmm5   // first 2 pixels
4676    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
4677    movdqu     xmm1, [eax]  // read 4 pixels
4678    punpckhbw  xmm1, xmm5   // next 2 pixels
4679    pmulhuw    xmm1, xmm2
4680    pmullw     xmm0, xmm3   // * interval_size
4681    movdqu     xmm7, [eax]  // read 4 pixels
4682    pmullw     xmm1, xmm3
4683    pand       xmm7, xmm6   // mask alpha
4684    paddw      xmm0, xmm4   // + interval_size / 2
4685    paddw      xmm1, xmm4
4686    packuswb   xmm0, xmm1
4687    por        xmm0, xmm7
4688    movdqu     [eax], xmm0
4689    lea        eax, [eax + 16]
4690    sub        ecx, 4
4691    jg         convertloop
4692    ret
4693  }
4694}
4695#endif  // HAS_ARGBQUANTIZEROW_SSE2
4696
4697#ifdef HAS_ARGBSHADEROW_SSE2
4698// Shade 4 pixels at a time by specified value.
4699__declspec(naked)
4700void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4701                       uint32 value) {
4702  __asm {
4703    mov        eax, [esp + 4]   // src_argb
4704    mov        edx, [esp + 8]   // dst_argb
4705    mov        ecx, [esp + 12]  // width
4706    movd       xmm2, [esp + 16]  // value
4707    punpcklbw  xmm2, xmm2
4708    punpcklqdq xmm2, xmm2
4709
4710 convertloop:
4711    movdqu     xmm0, [eax]      // read 4 pixels
4712    lea        eax, [eax + 16]
4713    movdqa     xmm1, xmm0
4714    punpcklbw  xmm0, xmm0       // first 2
4715    punpckhbw  xmm1, xmm1       // next 2
4716    pmulhuw    xmm0, xmm2       // argb * value
4717    pmulhuw    xmm1, xmm2       // argb * value
4718    psrlw      xmm0, 8
4719    psrlw      xmm1, 8
4720    packuswb   xmm0, xmm1
4721    movdqu     [edx], xmm0
4722    lea        edx, [edx + 16]
4723    sub        ecx, 4
4724    jg         convertloop
4725
4726    ret
4727  }
4728}
4729#endif  // HAS_ARGBSHADEROW_SSE2
4730
4731#ifdef HAS_ARGBMULTIPLYROW_SSE2
4732// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4733__declspec(naked)
4734void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4735                          uint8* dst_argb, int width) {
4736  __asm {
4737    push       esi
4738    mov        eax, [esp + 4 + 4]   // src_argb0
4739    mov        esi, [esp + 4 + 8]   // src_argb1
4740    mov        edx, [esp + 4 + 12]  // dst_argb
4741    mov        ecx, [esp + 4 + 16]  // width
4742    pxor       xmm5, xmm5  // constant 0
4743
4744 convertloop:
4745    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4746    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
4747    movdqu     xmm1, xmm0
4748    movdqu     xmm3, xmm2
4749    punpcklbw  xmm0, xmm0         // first 2
4750    punpckhbw  xmm1, xmm1         // next 2
4751    punpcklbw  xmm2, xmm5         // first 2
4752    punpckhbw  xmm3, xmm5         // next 2
4753    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
4754    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
4755    lea        eax, [eax + 16]
4756    lea        esi, [esi + 16]
4757    packuswb   xmm0, xmm1
4758    movdqu     [edx], xmm0
4759    lea        edx, [edx + 16]
4760    sub        ecx, 4
4761    jg         convertloop
4762
4763    pop        esi
4764    ret
4765  }
4766}
4767#endif  // HAS_ARGBMULTIPLYROW_SSE2
4768
4769#ifdef HAS_ARGBADDROW_SSE2
4770// Add 2 rows of ARGB pixels together, 4 pixels at a time.
4771// TODO(fbarchard): Port this to posix, neon and other math functions.
4772__declspec(naked)
4773void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4774                     uint8* dst_argb, int width) {
4775  __asm {
4776    push       esi
4777    mov        eax, [esp + 4 + 4]   // src_argb0
4778    mov        esi, [esp + 4 + 8]   // src_argb1
4779    mov        edx, [esp + 4 + 12]  // dst_argb
4780    mov        ecx, [esp + 4 + 16]  // width
4781
4782    sub        ecx, 4
4783    jl         convertloop49
4784
4785 convertloop4:
4786    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4787    lea        eax, [eax + 16]
4788    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4789    lea        esi, [esi + 16]
4790    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4791    movdqu     [edx], xmm0
4792    lea        edx, [edx + 16]
4793    sub        ecx, 4
4794    jge        convertloop4
4795
4796 convertloop49:
4797    add        ecx, 4 - 1
4798    jl         convertloop19
4799
4800 convertloop1:
4801    movd       xmm0, [eax]        // read 1 pixels from src_argb0
4802    lea        eax, [eax + 4]
4803    movd       xmm1, [esi]        // read 1 pixels from src_argb1
4804    lea        esi, [esi + 4]
4805    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4806    movd       [edx], xmm0
4807    lea        edx, [edx + 4]
4808    sub        ecx, 1
4809    jge        convertloop1
4810
4811 convertloop19:
4812    pop        esi
4813    ret
4814  }
4815}
4816#endif  // HAS_ARGBADDROW_SSE2
4817
4818#ifdef HAS_ARGBSUBTRACTROW_SSE2
4819// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4820__declspec(naked)
4821void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4822                          uint8* dst_argb, int width) {
4823  __asm {
4824    push       esi
4825    mov        eax, [esp + 4 + 4]   // src_argb0
4826    mov        esi, [esp + 4 + 8]   // src_argb1
4827    mov        edx, [esp + 4 + 12]  // dst_argb
4828    mov        ecx, [esp + 4 + 16]  // width
4829
4830 convertloop:
4831    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4832    lea        eax, [eax + 16]
4833    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4834    lea        esi, [esi + 16]
4835    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
4836    movdqu     [edx], xmm0
4837    lea        edx, [edx + 16]
4838    sub        ecx, 4
4839    jg         convertloop
4840
4841    pop        esi
4842    ret
4843  }
4844}
4845#endif  // HAS_ARGBSUBTRACTROW_SSE2
4846
4847#ifdef HAS_ARGBMULTIPLYROW_AVX2
4848// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4849__declspec(naked)
4850void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4851                          uint8* dst_argb, int width) {
4852  __asm {
4853    push       esi
4854    mov        eax, [esp + 4 + 4]   // src_argb0
4855    mov        esi, [esp + 4 + 8]   // src_argb1
4856    mov        edx, [esp + 4 + 12]  // dst_argb
4857    mov        ecx, [esp + 4 + 16]  // width
4858    vpxor      ymm5, ymm5, ymm5     // constant 0
4859
4860 convertloop:
4861    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
4862    lea        eax, [eax + 32]
4863    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
4864    lea        esi, [esi + 32]
4865    vpunpcklbw ymm0, ymm1, ymm1   // low 4
4866    vpunpckhbw ymm1, ymm1, ymm1   // high 4
4867    vpunpcklbw ymm2, ymm3, ymm5   // low 4
4868    vpunpckhbw ymm3, ymm3, ymm5   // high 4
4869    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
4870    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
4871    vpackuswb  ymm0, ymm0, ymm1
4872    vmovdqu    [edx], ymm0
4873    lea        edx, [edx + 32]
4874    sub        ecx, 8
4875    jg         convertloop
4876
4877    pop        esi
4878    vzeroupper
4879    ret
4880  }
4881}
4882#endif  // HAS_ARGBMULTIPLYROW_AVX2
4883
4884#ifdef HAS_ARGBADDROW_AVX2
4885// Add 2 rows of ARGB pixels together, 8 pixels at a time.
4886__declspec(naked)
4887void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4888                     uint8* dst_argb, int width) {
4889  __asm {
4890    push       esi
4891    mov        eax, [esp + 4 + 4]   // src_argb0
4892    mov        esi, [esp + 4 + 8]   // src_argb1
4893    mov        edx, [esp + 4 + 12]  // dst_argb
4894    mov        ecx, [esp + 4 + 16]  // width
4895
4896 convertloop:
4897    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
4898    lea        eax, [eax + 32]
4899    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
4900    lea        esi, [esi + 32]
4901    vmovdqu    [edx], ymm0
4902    lea        edx, [edx + 32]
4903    sub        ecx, 8
4904    jg         convertloop
4905
4906    pop        esi
4907    vzeroupper
4908    ret
4909  }
4910}
4911#endif  // HAS_ARGBADDROW_AVX2
4912
4913#ifdef HAS_ARGBSUBTRACTROW_AVX2
4914// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4915__declspec(naked)
4916void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4917                          uint8* dst_argb, int width) {
4918  __asm {
4919    push       esi
4920    mov        eax, [esp + 4 + 4]   // src_argb0
4921    mov        esi, [esp + 4 + 8]   // src_argb1
4922    mov        edx, [esp + 4 + 12]  // dst_argb
4923    mov        ecx, [esp + 4 + 16]  // width
4924
4925 convertloop:
4926    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
4927    lea        eax, [eax + 32]
4928    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
4929    lea        esi, [esi + 32]
4930    vmovdqu    [edx], ymm0
4931    lea        edx, [edx + 32]
4932    sub        ecx, 8
4933    jg         convertloop
4934
4935    pop        esi
4936    vzeroupper
4937    ret
4938  }
4939}
4940#endif  // HAS_ARGBSUBTRACTROW_AVX2
4941
4942#ifdef HAS_SOBELXROW_SSE2
4943// SobelX as a matrix is
4944// -1  0  1
4945// -2  0  2
4946// -1  0  1
4947__declspec(naked)
4948void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4949                    const uint8* src_y2, uint8* dst_sobelx, int width) {
4950  __asm {
4951    push       esi
4952    push       edi
4953    mov        eax, [esp + 8 + 4]   // src_y0
4954    mov        esi, [esp + 8 + 8]   // src_y1
4955    mov        edi, [esp + 8 + 12]  // src_y2
4956    mov        edx, [esp + 8 + 16]  // dst_sobelx
4957    mov        ecx, [esp + 8 + 20]  // width
4958    sub        esi, eax
4959    sub        edi, eax
4960    sub        edx, eax
4961    pxor       xmm5, xmm5  // constant 0
4962
4963 convertloop:
4964    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
4965    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
4966    punpcklbw  xmm0, xmm5
4967    punpcklbw  xmm1, xmm5
4968    psubw      xmm0, xmm1
4969    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
4970    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
4971    punpcklbw  xmm1, xmm5
4972    punpcklbw  xmm2, xmm5
4973    psubw      xmm1, xmm2
4974    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
4975    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
4976    punpcklbw  xmm2, xmm5
4977    punpcklbw  xmm3, xmm5
4978    psubw      xmm2, xmm3
4979    paddw      xmm0, xmm2
4980    paddw      xmm0, xmm1
4981    paddw      xmm0, xmm1
4982    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
4983    psubw      xmm1, xmm0
4984    pmaxsw     xmm0, xmm1
4985    packuswb   xmm0, xmm0
4986    movq       qword ptr [eax + edx], xmm0
4987    lea        eax, [eax + 8]
4988    sub        ecx, 8
4989    jg         convertloop
4990
4991    pop        edi
4992    pop        esi
4993    ret
4994  }
4995}
4996#endif  // HAS_SOBELXROW_SSE2
4997
4998#ifdef HAS_SOBELYROW_SSE2
4999// SobelY as a matrix is
5000// -1 -2 -1
5001//  0  0  0
5002//  1  2  1
5003__declspec(naked)
5004void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5005                    uint8* dst_sobely, int width) {
5006  __asm {
5007    push       esi
5008    mov        eax, [esp + 4 + 4]   // src_y0
5009    mov        esi, [esp + 4 + 8]   // src_y1
5010    mov        edx, [esp + 4 + 12]  // dst_sobely
5011    mov        ecx, [esp + 4 + 16]  // width
5012    sub        esi, eax
5013    sub        edx, eax
5014    pxor       xmm5, xmm5  // constant 0
5015
5016 convertloop:
5017    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5018    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5019    punpcklbw  xmm0, xmm5
5020    punpcklbw  xmm1, xmm5
5021    psubw      xmm0, xmm1
5022    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
5023    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
5024    punpcklbw  xmm1, xmm5
5025    punpcklbw  xmm2, xmm5
5026    psubw      xmm1, xmm2
5027    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5028    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5029    punpcklbw  xmm2, xmm5
5030    punpcklbw  xmm3, xmm5
5031    psubw      xmm2, xmm3
5032    paddw      xmm0, xmm2
5033    paddw      xmm0, xmm1
5034    paddw      xmm0, xmm1
5035    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5036    psubw      xmm1, xmm0
5037    pmaxsw     xmm0, xmm1
5038    packuswb   xmm0, xmm0
5039    movq       qword ptr [eax + edx], xmm0
5040    lea        eax, [eax + 8]
5041    sub        ecx, 8
5042    jg         convertloop
5043
5044    pop        esi
5045    ret
5046  }
5047}
5048#endif  // HAS_SOBELYROW_SSE2
5049
5050#ifdef HAS_SOBELROW_SSE2
5051// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5052// A = 255
5053// R = Sobel
5054// G = Sobel
5055// B = Sobel
5056__declspec(naked)
5057void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5058                   uint8* dst_argb, int width) {
5059  __asm {
5060    push       esi
5061    mov        eax, [esp + 4 + 4]   // src_sobelx
5062    mov        esi, [esp + 4 + 8]   // src_sobely
5063    mov        edx, [esp + 4 + 12]  // dst_argb
5064    mov        ecx, [esp + 4 + 16]  // width
5065    sub        esi, eax
5066    pcmpeqb    xmm5, xmm5           // alpha 255
5067    pslld      xmm5, 24             // 0xff000000
5068
5069 convertloop:
5070    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5071    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5072    lea        eax, [eax + 16]
5073    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
5074    movdqa     xmm2, xmm0             // GG
5075    punpcklbw  xmm2, xmm0             // First 8
5076    punpckhbw  xmm0, xmm0             // Next 8
5077    movdqa     xmm1, xmm2             // GGGG
5078    punpcklwd  xmm1, xmm2             // First 4
5079    punpckhwd  xmm2, xmm2             // Next 4
5080    por        xmm1, xmm5             // GGGA
5081    por        xmm2, xmm5
5082    movdqa     xmm3, xmm0             // GGGG
5083    punpcklwd  xmm3, xmm0             // Next 4
5084    punpckhwd  xmm0, xmm0             // Last 4
5085    por        xmm3, xmm5             // GGGA
5086    por        xmm0, xmm5
5087    movdqu     [edx], xmm1
5088    movdqu     [edx + 16], xmm2
5089    movdqu     [edx + 32], xmm3
5090    movdqu     [edx + 48], xmm0
5091    lea        edx, [edx + 64]
5092    sub        ecx, 16
5093    jg         convertloop
5094
5095    pop        esi
5096    ret
5097  }
5098}
5099#endif  // HAS_SOBELROW_SSE2
5100
5101#ifdef HAS_SOBELTOPLANEROW_SSE2
5102// Adds Sobel X and Sobel Y and stores Sobel into a plane.
5103__declspec(naked)
5104void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5105                          uint8* dst_y, int width) {
5106  __asm {
5107    push       esi
5108    mov        eax, [esp + 4 + 4]   // src_sobelx
5109    mov        esi, [esp + 4 + 8]   // src_sobely
5110    mov        edx, [esp + 4 + 12]  // dst_argb
5111    mov        ecx, [esp + 4 + 16]  // width
5112    sub        esi, eax
5113
5114 convertloop:
5115    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5116    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5117    lea        eax, [eax + 16]
5118    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
5119    movdqu     [edx], xmm0
5120    lea        edx, [edx + 16]
5121    sub        ecx, 16
5122    jg         convertloop
5123
5124    pop        esi
5125    ret
5126  }
5127}
5128#endif  // HAS_SOBELTOPLANEROW_SSE2
5129
5130#ifdef HAS_SOBELXYROW_SSE2
5131// Mixes Sobel X, Sobel Y and Sobel into ARGB.
5132// A = 255
5133// R = Sobel X
5134// G = Sobel
5135// B = Sobel Y
5136__declspec(naked)
5137void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5138                     uint8* dst_argb, int width) {
5139  __asm {
5140    push       esi
5141    mov        eax, [esp + 4 + 4]   // src_sobelx
5142    mov        esi, [esp + 4 + 8]   // src_sobely
5143    mov        edx, [esp + 4 + 12]  // dst_argb
5144    mov        ecx, [esp + 4 + 16]  // width
5145    sub        esi, eax
5146    pcmpeqb    xmm5, xmm5           // alpha 255
5147
5148 convertloop:
5149    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5150    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5151    lea        eax, [eax + 16]
5152    movdqa     xmm2, xmm0
5153    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
5154    movdqa     xmm3, xmm0             // XA
5155    punpcklbw  xmm3, xmm5
5156    punpckhbw  xmm0, xmm5
5157    movdqa     xmm4, xmm1             // YS
5158    punpcklbw  xmm4, xmm2
5159    punpckhbw  xmm1, xmm2
5160    movdqa     xmm6, xmm4             // YSXA
5161    punpcklwd  xmm6, xmm3             // First 4
5162    punpckhwd  xmm4, xmm3             // Next 4
5163    movdqa     xmm7, xmm1             // YSXA
5164    punpcklwd  xmm7, xmm0             // Next 4
5165    punpckhwd  xmm1, xmm0             // Last 4
5166    movdqu     [edx], xmm6
5167    movdqu     [edx + 16], xmm4
5168    movdqu     [edx + 32], xmm7
5169    movdqu     [edx + 48], xmm1
5170    lea        edx, [edx + 64]
5171    sub        ecx, 16
5172    jg         convertloop
5173
5174    pop        esi
5175    ret
5176  }
5177}
5178#endif  // HAS_SOBELXYROW_SSE2
5179
5180#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5181// Consider float CumulativeSum.
5182// Consider calling CumulativeSum one row at time as needed.
5183// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5184// Convert cumulative sum for an area to an average for 1 pixel.
5185// topleft is pointer to top left of CumulativeSum buffer for area.
5186// botleft is pointer to bottom left of CumulativeSum buffer.
5187// width is offset from left to right of area in CumulativeSum buffer measured
5188//   in number of ints.
5189// area is the number of pixels in the area being averaged.
5190// dst points to pixel to store result to.
5191// count is number of averaged pixels to produce.
5192// Does 4 pixels at a time.
5193void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5194                                    int width, int area, uint8* dst,
5195                                    int count) {
5196  __asm {
5197    mov        eax, topleft  // eax topleft
5198    mov        esi, botleft  // esi botleft
5199    mov        edx, width
5200    movd       xmm5, area
5201    mov        edi, dst
5202    mov        ecx, count
5203    cvtdq2ps   xmm5, xmm5
5204    rcpss      xmm4, xmm5  // 1.0f / area
5205    pshufd     xmm4, xmm4, 0
5206    sub        ecx, 4
5207    jl         l4b
5208
5209    cmp        area, 128  // 128 pixels will not overflow 15 bits.
5210    ja         l4
5211
5212    pshufd     xmm5, xmm5, 0        // area
5213    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
5214    psrld      xmm6, 16
5215    cvtdq2ps   xmm6, xmm6
5216    addps      xmm5, xmm6           // (65536.0 + area - 1)
5217    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
5218    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
5219    packssdw   xmm5, xmm5           // 16 bit shorts
5220
5221    // 4 pixel loop small blocks.
5222  s4:
5223    // top left
5224    movdqu     xmm0, [eax]
5225    movdqu     xmm1, [eax + 16]
5226    movdqu     xmm2, [eax + 32]
5227    movdqu     xmm3, [eax + 48]
5228
5229    // - top right
5230    psubd      xmm0, [eax + edx * 4]
5231    psubd      xmm1, [eax + edx * 4 + 16]
5232    psubd      xmm2, [eax + edx * 4 + 32]
5233    psubd      xmm3, [eax + edx * 4 + 48]
5234    lea        eax, [eax + 64]
5235
5236    // - bottom left
5237    psubd      xmm0, [esi]
5238    psubd      xmm1, [esi + 16]
5239    psubd      xmm2, [esi + 32]
5240    psubd      xmm3, [esi + 48]
5241
5242    // + bottom right
5243    paddd      xmm0, [esi + edx * 4]
5244    paddd      xmm1, [esi + edx * 4 + 16]
5245    paddd      xmm2, [esi + edx * 4 + 32]
5246    paddd      xmm3, [esi + edx * 4 + 48]
5247    lea        esi, [esi + 64]
5248
5249    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
5250    packssdw   xmm2, xmm3
5251
5252    pmulhuw    xmm0, xmm5
5253    pmulhuw    xmm2, xmm5
5254
5255    packuswb   xmm0, xmm2
5256    movdqu     [edi], xmm0
5257    lea        edi, [edi + 16]
5258    sub        ecx, 4
5259    jge        s4
5260
5261    jmp        l4b
5262
5263    // 4 pixel loop
5264  l4:
5265    // top left
5266    movdqu     xmm0, [eax]
5267    movdqu     xmm1, [eax + 16]
5268    movdqu     xmm2, [eax + 32]
5269    movdqu     xmm3, [eax + 48]
5270
5271    // - top right
5272    psubd      xmm0, [eax + edx * 4]
5273    psubd      xmm1, [eax + edx * 4 + 16]
5274    psubd      xmm2, [eax + edx * 4 + 32]
5275    psubd      xmm3, [eax + edx * 4 + 48]
5276    lea        eax, [eax + 64]
5277
5278    // - bottom left
5279    psubd      xmm0, [esi]
5280    psubd      xmm1, [esi + 16]
5281    psubd      xmm2, [esi + 32]
5282    psubd      xmm3, [esi + 48]
5283
5284    // + bottom right
5285    paddd      xmm0, [esi + edx * 4]
5286    paddd      xmm1, [esi + edx * 4 + 16]
5287    paddd      xmm2, [esi + edx * 4 + 32]
5288    paddd      xmm3, [esi + edx * 4 + 48]
5289    lea        esi, [esi + 64]
5290
5291    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
5292    cvtdq2ps   xmm1, xmm1
5293    mulps      xmm0, xmm4
5294    mulps      xmm1, xmm4
5295    cvtdq2ps   xmm2, xmm2
5296    cvtdq2ps   xmm3, xmm3
5297    mulps      xmm2, xmm4
5298    mulps      xmm3, xmm4
5299    cvtps2dq   xmm0, xmm0
5300    cvtps2dq   xmm1, xmm1
5301    cvtps2dq   xmm2, xmm2
5302    cvtps2dq   xmm3, xmm3
5303    packssdw   xmm0, xmm1
5304    packssdw   xmm2, xmm3
5305    packuswb   xmm0, xmm2
5306    movdqu     [edi], xmm0
5307    lea        edi, [edi + 16]
5308    sub        ecx, 4
5309    jge        l4
5310
5311  l4b:
5312    add        ecx, 4 - 1
5313    jl         l1b
5314
5315    // 1 pixel loop
5316  l1:
5317    movdqu     xmm0, [eax]
5318    psubd      xmm0, [eax + edx * 4]
5319    lea        eax, [eax + 16]
5320    psubd      xmm0, [esi]
5321    paddd      xmm0, [esi + edx * 4]
5322    lea        esi, [esi + 16]
5323    cvtdq2ps   xmm0, xmm0
5324    mulps      xmm0, xmm4
5325    cvtps2dq   xmm0, xmm0
5326    packssdw   xmm0, xmm0
5327    packuswb   xmm0, xmm0
5328    movd       dword ptr [edi], xmm0
5329    lea        edi, [edi + 4]
5330    sub        ecx, 1
5331    jge        l1
5332  l1b:
5333  }
5334}
5335#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5336
5337#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5338// Creates a table of cumulative sums where each value is a sum of all values
5339// above and to the left of the value.
5340void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5341                                  const int32* previous_cumsum, int width) {
5342  __asm {
5343    mov        eax, row
5344    mov        edx, cumsum
5345    mov        esi, previous_cumsum
5346    mov        ecx, width
5347    pxor       xmm0, xmm0
5348    pxor       xmm1, xmm1
5349
5350    sub        ecx, 4
5351    jl         l4b
5352    test       edx, 15
5353    jne        l4b
5354
5355    // 4 pixel loop
5356  l4:
5357    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
5358    lea        eax, [eax + 16]
5359    movdqa     xmm4, xmm2
5360
5361    punpcklbw  xmm2, xmm1
5362    movdqa     xmm3, xmm2
5363    punpcklwd  xmm2, xmm1
5364    punpckhwd  xmm3, xmm1
5365
5366    punpckhbw  xmm4, xmm1
5367    movdqa     xmm5, xmm4
5368    punpcklwd  xmm4, xmm1
5369    punpckhwd  xmm5, xmm1
5370
5371    paddd      xmm0, xmm2
5372    movdqu     xmm2, [esi]  // previous row above.
5373    paddd      xmm2, xmm0
5374
5375    paddd      xmm0, xmm3
5376    movdqu     xmm3, [esi + 16]
5377    paddd      xmm3, xmm0
5378
5379    paddd      xmm0, xmm4
5380    movdqu     xmm4, [esi + 32]
5381    paddd      xmm4, xmm0
5382
5383    paddd      xmm0, xmm5
5384    movdqu     xmm5, [esi + 48]
5385    lea        esi, [esi + 64]
5386    paddd      xmm5, xmm0
5387
5388    movdqu     [edx], xmm2
5389    movdqu     [edx + 16], xmm3
5390    movdqu     [edx + 32], xmm4
5391    movdqu     [edx + 48], xmm5
5392
5393    lea        edx, [edx + 64]
5394    sub        ecx, 4
5395    jge        l4
5396
5397  l4b:
5398    add        ecx, 4 - 1
5399    jl         l1b
5400
5401    // 1 pixel loop
5402  l1:
5403    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
5404    lea        eax, [eax + 4]
5405    punpcklbw  xmm2, xmm1
5406    punpcklwd  xmm2, xmm1
5407    paddd      xmm0, xmm2
5408    movdqu     xmm2, [esi]
5409    lea        esi, [esi + 16]
5410    paddd      xmm2, xmm0
5411    movdqu     [edx], xmm2
5412    lea        edx, [edx + 16]
5413    sub        ecx, 1
5414    jge        l1
5415
5416 l1b:
5417  }
5418}
5419#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
5420
5421#ifdef HAS_ARGBAFFINEROW_SSE2
5422// Copy ARGB pixels from source image with slope to a row of destination.
5423__declspec(naked)
5424LIBYUV_API
5425void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5426                        uint8* dst_argb, const float* uv_dudv, int width) {
5427  __asm {
5428    push       esi
5429    push       edi
5430    mov        eax, [esp + 12]  // src_argb
5431    mov        esi, [esp + 16]  // stride
5432    mov        edx, [esp + 20]  // dst_argb
5433    mov        ecx, [esp + 24]  // pointer to uv_dudv
5434    movq       xmm2, qword ptr [ecx]  // uv
5435    movq       xmm7, qword ptr [ecx + 8]  // dudv
5436    mov        ecx, [esp + 28]  // width
5437    shl        esi, 16          // 4, stride
5438    add        esi, 4
5439    movd       xmm5, esi
5440    sub        ecx, 4
5441    jl         l4b
5442
5443    // setup for 4 pixel loop
5444    pshufd     xmm7, xmm7, 0x44  // dup dudv
5445    pshufd     xmm5, xmm5, 0  // dup 4, stride
5446    movdqa     xmm0, xmm2    // x0, y0, x1, y1
5447    addps      xmm0, xmm7
5448    movlhps    xmm2, xmm0
5449    movdqa     xmm4, xmm7
5450    addps      xmm4, xmm4    // dudv *= 2
5451    movdqa     xmm3, xmm2    // x2, y2, x3, y3
5452    addps      xmm3, xmm4
5453    addps      xmm4, xmm4    // dudv *= 4
5454
5455    // 4 pixel loop
5456  l4:
5457    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
5458    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
5459    packssdw   xmm0, xmm1    // x, y as 8 shorts
5460    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
5461    movd       esi, xmm0
5462    pshufd     xmm0, xmm0, 0x39  // shift right
5463    movd       edi, xmm0
5464    pshufd     xmm0, xmm0, 0x39  // shift right
5465    movd       xmm1, [eax + esi]  // read pixel 0
5466    movd       xmm6, [eax + edi]  // read pixel 1
5467    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
5468    addps      xmm2, xmm4    // x, y += dx, dy first 2
5469    movq       qword ptr [edx], xmm1
5470    movd       esi, xmm0
5471    pshufd     xmm0, xmm0, 0x39  // shift right
5472    movd       edi, xmm0
5473    movd       xmm6, [eax + esi]  // read pixel 2
5474    movd       xmm0, [eax + edi]  // read pixel 3
5475    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
5476    addps      xmm3, xmm4    // x, y += dx, dy next 2
5477    movq       qword ptr 8[edx], xmm6
5478    lea        edx, [edx + 16]
5479    sub        ecx, 4
5480    jge        l4
5481
5482  l4b:
5483    add        ecx, 4 - 1
5484    jl         l1b
5485
5486    // 1 pixel loop
5487  l1:
5488    cvttps2dq  xmm0, xmm2    // x, y float to int
5489    packssdw   xmm0, xmm0    // x, y as shorts
5490    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
5491    addps      xmm2, xmm7    // x, y += dx, dy
5492    movd       esi, xmm0
5493    movd       xmm0, [eax + esi]  // copy a pixel
5494    movd       [edx], xmm0
5495    lea        edx, [edx + 4]
5496    sub        ecx, 1
5497    jge        l1
5498  l1b:
5499    pop        edi
5500    pop        esi
5501    ret
5502  }
5503}
5504#endif  // HAS_ARGBAFFINEROW_SSE2
5505
5506#ifdef HAS_INTERPOLATEROW_AVX2
5507// Bilinear filter 32x2 -> 32x1
5508__declspec(naked)
5509void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
5510                         ptrdiff_t src_stride, int dst_width,
5511                         int source_y_fraction) {
5512  __asm {
5513    push       esi
5514    push       edi
5515    mov        edi, [esp + 8 + 4]   // dst_ptr
5516    mov        esi, [esp + 8 + 8]   // src_ptr
5517    mov        edx, [esp + 8 + 12]  // src_stride
5518    mov        ecx, [esp + 8 + 16]  // dst_width
5519    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5520    shr        eax, 1
5521    // Dispatch to specialized filters if applicable.
5522    cmp        eax, 0
5523    je         xloop100  // 0 / 128.  Blend 100 / 0.
5524    sub        edi, esi
5525    cmp        eax, 32
5526    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
5527    cmp        eax, 64
5528    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
5529    cmp        eax, 96
5530    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
5531
5532    vmovd      xmm0, eax  // high fraction 0..127
5533    neg        eax
5534    add        eax, 128
5535    vmovd      xmm5, eax  // low fraction 128..1
5536    vpunpcklbw xmm5, xmm5, xmm0
5537    vpunpcklwd xmm5, xmm5, xmm5
5538    vpxor      ymm0, ymm0, ymm0
5539    vpermd     ymm5, ymm0, ymm5
5540
5541  xloop:
5542    vmovdqu    ymm0, [esi]
5543    vmovdqu    ymm2, [esi + edx]
5544    vpunpckhbw ymm1, ymm0, ymm2  // mutates
5545    vpunpcklbw ymm0, ymm0, ymm2  // mutates
5546    vpmaddubsw ymm0, ymm0, ymm5
5547    vpmaddubsw ymm1, ymm1, ymm5
5548    vpsrlw     ymm0, ymm0, 7
5549    vpsrlw     ymm1, ymm1, 7
5550    vpackuswb  ymm0, ymm0, ymm1  // unmutates
5551    vmovdqu    [esi + edi], ymm0
5552    lea        esi, [esi + 32]
5553    sub        ecx, 32
5554    jg         xloop
5555    jmp        xloop99
5556
5557   // Blend 25 / 75.
5558 xloop25:
5559   vmovdqu    ymm0, [esi]
5560   vmovdqu    ymm1, [esi + edx]
5561   vpavgb     ymm0, ymm0, ymm1
5562   vpavgb     ymm0, ymm0, ymm1
5563   vmovdqu    [esi + edi], ymm0
5564   lea        esi, [esi + 32]
5565   sub        ecx, 32
5566   jg         xloop25
5567   jmp        xloop99
5568
5569   // Blend 50 / 50.
5570 xloop50:
5571   vmovdqu    ymm0, [esi]
5572   vpavgb     ymm0, ymm0, [esi + edx]
5573   vmovdqu    [esi + edi], ymm0
5574   lea        esi, [esi + 32]
5575   sub        ecx, 32
5576   jg         xloop50
5577   jmp        xloop99
5578
5579   // Blend 75 / 25.
5580 xloop75:
5581   vmovdqu    ymm1, [esi]
5582   vmovdqu    ymm0, [esi + edx]
5583   vpavgb     ymm0, ymm0, ymm1
5584   vpavgb     ymm0, ymm0, ymm1
5585   vmovdqu    [esi + edi], ymm0
5586   lea        esi, [esi + 32]
5587   sub        ecx, 32
5588   jg         xloop75
5589   jmp        xloop99
5590
5591   // Blend 100 / 0 - Copy row unchanged.
5592 xloop100:
5593   rep movsb
5594
5595  xloop99:
5596    pop        edi
5597    pop        esi
5598    vzeroupper
5599    ret
5600  }
5601}
5602#endif  // HAS_INTERPOLATEROW_AVX2
5603
5604// Bilinear filter 16x2 -> 16x1
5605__declspec(naked)
5606void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5607                          ptrdiff_t src_stride, int dst_width,
5608                          int source_y_fraction) {
5609  __asm {
5610    push       esi
5611    push       edi
5612    mov        edi, [esp + 8 + 4]   // dst_ptr
5613    mov        esi, [esp + 8 + 8]   // src_ptr
5614    mov        edx, [esp + 8 + 12]  // src_stride
5615    mov        ecx, [esp + 8 + 16]  // dst_width
5616    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5617    sub        edi, esi
5618    shr        eax, 1
5619    // Dispatch to specialized filters if applicable.
5620    cmp        eax, 0
5621    je         xloop100  // 0 / 128.  Blend 100 / 0.
5622    cmp        eax, 32
5623    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
5624    cmp        eax, 64
5625    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
5626    cmp        eax, 96
5627    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
5628
5629    movd       xmm0, eax  // high fraction 0..127
5630    neg        eax
5631    add        eax, 128
5632    movd       xmm5, eax  // low fraction 128..1
5633    punpcklbw  xmm5, xmm0
5634    punpcklwd  xmm5, xmm5
5635    pshufd     xmm5, xmm5, 0
5636
5637  xloop:
5638    movdqu     xmm0, [esi]
5639    movdqu     xmm2, [esi + edx]
5640    movdqu     xmm1, xmm0
5641    punpcklbw  xmm0, xmm2
5642    punpckhbw  xmm1, xmm2
5643    pmaddubsw  xmm0, xmm5
5644    pmaddubsw  xmm1, xmm5
5645    psrlw      xmm0, 7
5646    psrlw      xmm1, 7
5647    packuswb   xmm0, xmm1
5648    movdqu     [esi + edi], xmm0
5649    lea        esi, [esi + 16]
5650    sub        ecx, 16
5651    jg         xloop
5652    jmp        xloop99
5653
5654    // Blend 25 / 75.
5655  xloop25:
5656    movdqu     xmm0, [esi]
5657    movdqu     xmm1, [esi + edx]
5658    pavgb      xmm0, xmm1
5659    pavgb      xmm0, xmm1
5660    movdqu     [esi + edi], xmm0
5661    lea        esi, [esi + 16]
5662    sub        ecx, 16
5663    jg         xloop25
5664    jmp        xloop99
5665
5666    // Blend 50 / 50.
5667  xloop50:
5668    movdqu     xmm0, [esi]
5669    movdqu     xmm1, [esi + edx]
5670    pavgb      xmm0, xmm1
5671    movdqu     [esi + edi], xmm0
5672    lea        esi, [esi + 16]
5673    sub        ecx, 16
5674    jg         xloop50
5675    jmp        xloop99
5676
5677    // Blend 75 / 25.
5678  xloop75:
5679    movdqu     xmm1, [esi]
5680    movdqu     xmm0, [esi + edx]
5681    pavgb      xmm0, xmm1
5682    pavgb      xmm0, xmm1
5683    movdqu     [esi + edi], xmm0
5684    lea        esi, [esi + 16]
5685    sub        ecx, 16
5686    jg         xloop75
5687    jmp        xloop99
5688
5689    // Blend 100 / 0 - Copy row unchanged.
5690  xloop100:
5691    movdqu     xmm0, [esi]
5692    movdqu     [esi + edi], xmm0
5693    lea        esi, [esi + 16]
5694    sub        ecx, 16
5695    jg         xloop100
5696
5697  xloop99:
5698    pop        edi
5699    pop        esi
5700    ret
5701  }
5702}
5703
5704#ifdef HAS_INTERPOLATEROW_SSE2
5705// Bilinear filter 16x2 -> 16x1
5706__declspec(naked)
5707void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
5708                         ptrdiff_t src_stride, int dst_width,
5709                         int source_y_fraction) {
5710  __asm {
5711    push       esi
5712    push       edi
5713    mov        edi, [esp + 8 + 4]   // dst_ptr
5714    mov        esi, [esp + 8 + 8]   // src_ptr
5715    mov        edx, [esp + 8 + 12]  // src_stride
5716    mov        ecx, [esp + 8 + 16]  // dst_width
5717    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5718    sub        edi, esi
5719    // Dispatch to specialized filters if applicable.
5720    cmp        eax, 0
5721    je         xloop100  // 0 / 256.  Blend 100 / 0.
5722    cmp        eax, 64
5723    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
5724    cmp        eax, 128
5725    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
5726    cmp        eax, 192
5727    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
5728
5729    movd       xmm5, eax            // xmm5 = y fraction
5730    punpcklbw  xmm5, xmm5
5731    psrlw      xmm5, 1
5732    punpcklwd  xmm5, xmm5
5733    punpckldq  xmm5, xmm5
5734    punpcklqdq xmm5, xmm5
5735    pxor       xmm4, xmm4
5736
5737  xloop:
5738    movdqu     xmm0, [esi]  // row0
5739    movdqu     xmm2, [esi + edx]  // row1
5740    movdqu     xmm1, xmm0
5741    movdqu     xmm3, xmm2
5742    punpcklbw  xmm2, xmm4
5743    punpckhbw  xmm3, xmm4
5744    punpcklbw  xmm0, xmm4
5745    punpckhbw  xmm1, xmm4
5746    psubw      xmm2, xmm0  // row1 - row0
5747    psubw      xmm3, xmm1
5748    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
5749    paddw      xmm3, xmm3
5750    pmulhw     xmm2, xmm5  // scale diff
5751    pmulhw     xmm3, xmm5
5752    paddw      xmm0, xmm2  // sum rows
5753    paddw      xmm1, xmm3
5754    packuswb   xmm0, xmm1
5755    movdqu     [esi + edi], xmm0
5756    lea        esi, [esi + 16]
5757    sub        ecx, 16
5758    jg         xloop
5759    jmp        xloop99
5760
5761    // Blend 25 / 75.
5762  xloop25:
5763    movdqu     xmm0, [esi]
5764    movdqu     xmm1, [esi + edx]
5765    pavgb      xmm0, xmm1
5766    pavgb      xmm0, xmm1
5767    movdqu     [esi + edi], xmm0
5768    lea        esi, [esi + 16]
5769    sub        ecx, 16
5770    jg         xloop25
5771    jmp        xloop99
5772
5773    // Blend 50 / 50.
5774  xloop50:
5775    movdqu     xmm0, [esi]
5776    movdqu     xmm1, [esi + edx]
5777    pavgb      xmm0, xmm1
5778    movdqu     [esi + edi], xmm0
5779    lea        esi, [esi + 16]
5780    sub        ecx, 16
5781    jg         xloop50
5782    jmp        xloop99
5783
5784    // Blend 75 / 25.
5785  xloop75:
5786    movdqu     xmm1, [esi]
5787    movdqu     xmm0, [esi + edx]
5788    pavgb      xmm0, xmm1
5789    pavgb      xmm0, xmm1
5790    movdqu     [esi + edi], xmm0
5791    lea        esi, [esi + 16]
5792    sub        ecx, 16
5793    jg         xloop75
5794    jmp        xloop99
5795
5796    // Blend 100 / 0 - Copy row unchanged.
5797  xloop100:
5798    movdqu     xmm0, [esi]
5799    movdqu     [esi + edi], xmm0
5800    lea        esi, [esi + 16]
5801    sub        ecx, 16
5802    jg         xloop100
5803
5804  xloop99:
5805    pop        edi
5806    pop        esi
5807    ret
5808  }
5809}
5810#endif  // HAS_INTERPOLATEROW_SSE2
5811
5812// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5813__declspec(naked)
5814void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5815                          const uint8* shuffler, int pix) {
5816  __asm {
5817    mov        eax, [esp + 4]    // src_argb
5818    mov        edx, [esp + 8]    // dst_argb
5819    mov        ecx, [esp + 12]   // shuffler
5820    movdqu     xmm5, [ecx]
5821    mov        ecx, [esp + 16]   // pix
5822
5823  wloop:
5824    movdqu     xmm0, [eax]
5825    movdqu     xmm1, [eax + 16]
5826    lea        eax, [eax + 32]
5827    pshufb     xmm0, xmm5
5828    pshufb     xmm1, xmm5
5829    movdqu     [edx], xmm0
5830    movdqu     [edx + 16], xmm1
5831    lea        edx, [edx + 32]
5832    sub        ecx, 8
5833    jg         wloop
5834    ret
5835  }
5836}
5837
5838#ifdef HAS_ARGBSHUFFLEROW_AVX2
5839__declspec(naked)
5840void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5841                         const uint8* shuffler, int pix) {
5842  __asm {
5843    mov        eax, [esp + 4]     // src_argb
5844    mov        edx, [esp + 8]     // dst_argb
5845    mov        ecx, [esp + 12]    // shuffler
5846    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
5847    mov        ecx, [esp + 16]    // pix
5848
5849  wloop:
5850    vmovdqu    ymm0, [eax]
5851    vmovdqu    ymm1, [eax + 32]
5852    lea        eax, [eax + 64]
5853    vpshufb    ymm0, ymm0, ymm5
5854    vpshufb    ymm1, ymm1, ymm5
5855    vmovdqu    [edx], ymm0
5856    vmovdqu    [edx + 32], ymm1
5857    lea        edx, [edx + 64]
5858    sub        ecx, 16
5859    jg         wloop
5860
5861    vzeroupper
5862    ret
5863  }
5864}
5865#endif  // HAS_ARGBSHUFFLEROW_AVX2
5866
5867__declspec(naked)
5868void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5869                         const uint8* shuffler, int pix) {
5870  __asm {
5871    push       ebx
5872    push       esi
5873    mov        eax, [esp + 8 + 4]    // src_argb
5874    mov        edx, [esp + 8 + 8]    // dst_argb
5875    mov        esi, [esp + 8 + 12]   // shuffler
5876    mov        ecx, [esp + 8 + 16]   // pix
5877    pxor       xmm5, xmm5
5878
5879    mov        ebx, [esi]   // shuffler
5880    cmp        ebx, 0x03000102
5881    je         shuf_3012
5882    cmp        ebx, 0x00010203
5883    je         shuf_0123
5884    cmp        ebx, 0x00030201
5885    je         shuf_0321
5886    cmp        ebx, 0x02010003
5887    je         shuf_2103
5888
5889  // TODO(fbarchard): Use one source pointer and 3 offsets.
5890  shuf_any1:
5891    movzx      ebx, byte ptr [esi]
5892    movzx      ebx, byte ptr [eax + ebx]
5893    mov        [edx], bl
5894    movzx      ebx, byte ptr [esi + 1]
5895    movzx      ebx, byte ptr [eax + ebx]
5896    mov        [edx + 1], bl
5897    movzx      ebx, byte ptr [esi + 2]
5898    movzx      ebx, byte ptr [eax + ebx]
5899    mov        [edx + 2], bl
5900    movzx      ebx, byte ptr [esi + 3]
5901    movzx      ebx, byte ptr [eax + ebx]
5902    mov        [edx + 3], bl
5903    lea        eax, [eax + 4]
5904    lea        edx, [edx + 4]
5905    sub        ecx, 1
5906    jg         shuf_any1
5907    jmp        shuf99
5908
5909  shuf_0123:
5910    movdqu     xmm0, [eax]
5911    lea        eax, [eax + 16]
5912    movdqa     xmm1, xmm0
5913    punpcklbw  xmm0, xmm5
5914    punpckhbw  xmm1, xmm5
5915    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
5916    pshuflw    xmm0, xmm0, 01Bh
5917    pshufhw    xmm1, xmm1, 01Bh
5918    pshuflw    xmm1, xmm1, 01Bh
5919    packuswb   xmm0, xmm1
5920    movdqu     [edx], xmm0
5921    lea        edx, [edx + 16]
5922    sub        ecx, 4
5923    jg         shuf_0123
5924    jmp        shuf99
5925
5926  shuf_0321:
5927    movdqu     xmm0, [eax]
5928    lea        eax, [eax + 16]
5929    movdqa     xmm1, xmm0
5930    punpcklbw  xmm0, xmm5
5931    punpckhbw  xmm1, xmm5
5932    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
5933    pshuflw    xmm0, xmm0, 039h
5934    pshufhw    xmm1, xmm1, 039h
5935    pshuflw    xmm1, xmm1, 039h
5936    packuswb   xmm0, xmm1
5937    movdqu     [edx], xmm0
5938    lea        edx, [edx + 16]
5939    sub        ecx, 4
5940    jg         shuf_0321
5941    jmp        shuf99
5942
5943  shuf_2103:
5944    movdqu     xmm0, [eax]
5945    lea        eax, [eax + 16]
5946    movdqa     xmm1, xmm0
5947    punpcklbw  xmm0, xmm5
5948    punpckhbw  xmm1, xmm5
5949    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
5950    pshuflw    xmm0, xmm0, 093h
5951    pshufhw    xmm1, xmm1, 093h
5952    pshuflw    xmm1, xmm1, 093h
5953    packuswb   xmm0, xmm1
5954    movdqu     [edx], xmm0
5955    lea        edx, [edx + 16]
5956    sub        ecx, 4
5957    jg         shuf_2103
5958    jmp        shuf99
5959
5960  shuf_3012:
5961    movdqu     xmm0, [eax]
5962    lea        eax, [eax + 16]
5963    movdqa     xmm1, xmm0
5964    punpcklbw  xmm0, xmm5
5965    punpckhbw  xmm1, xmm5
5966    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
5967    pshuflw    xmm0, xmm0, 0C6h
5968    pshufhw    xmm1, xmm1, 0C6h
5969    pshuflw    xmm1, xmm1, 0C6h
5970    packuswb   xmm0, xmm1
5971    movdqu     [edx], xmm0
5972    lea        edx, [edx + 16]
5973    sub        ecx, 4
5974    jg         shuf_3012
5975
5976  shuf99:
5977    pop        esi
5978    pop        ebx
5979    ret
5980  }
5981}
5982
5983// YUY2 - Macro-pixel = 2 image pixels
5984// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5985
5986// UYVY - Macro-pixel = 2 image pixels
5987// U0Y0V0Y1
5988
5989__declspec(naked)
5990void I422ToYUY2Row_SSE2(const uint8* src_y,
5991                        const uint8* src_u,
5992                        const uint8* src_v,
5993                        uint8* dst_frame, int width) {
5994  __asm {
5995    push       esi
5996    push       edi
5997    mov        eax, [esp + 8 + 4]    // src_y
5998    mov        esi, [esp + 8 + 8]    // src_u
5999    mov        edx, [esp + 8 + 12]   // src_v
6000    mov        edi, [esp + 8 + 16]   // dst_frame
6001    mov        ecx, [esp + 8 + 20]   // width
6002    sub        edx, esi
6003
6004  convertloop:
6005    movq       xmm2, qword ptr [esi] // U
6006    movq       xmm3, qword ptr [esi + edx] // V
6007    lea        esi, [esi + 8]
6008    punpcklbw  xmm2, xmm3 // UV
6009    movdqu     xmm0, [eax] // Y
6010    lea        eax, [eax + 16]
6011    movdqa     xmm1, xmm0
6012    punpcklbw  xmm0, xmm2 // YUYV
6013    punpckhbw  xmm1, xmm2
6014    movdqu     [edi], xmm0
6015    movdqu     [edi + 16], xmm1
6016    lea        edi, [edi + 32]
6017    sub        ecx, 16
6018    jg         convertloop
6019
6020    pop        edi
6021    pop        esi
6022    ret
6023  }
6024}
6025
6026__declspec(naked)
6027void I422ToUYVYRow_SSE2(const uint8* src_y,
6028                        const uint8* src_u,
6029                        const uint8* src_v,
6030                        uint8* dst_frame, int width) {
6031  __asm {
6032    push       esi
6033    push       edi
6034    mov        eax, [esp + 8 + 4]    // src_y
6035    mov        esi, [esp + 8 + 8]    // src_u
6036    mov        edx, [esp + 8 + 12]   // src_v
6037    mov        edi, [esp + 8 + 16]   // dst_frame
6038    mov        ecx, [esp + 8 + 20]   // width
6039    sub        edx, esi
6040
6041  convertloop:
6042    movq       xmm2, qword ptr [esi] // U
6043    movq       xmm3, qword ptr [esi + edx] // V
6044    lea        esi, [esi + 8]
6045    punpcklbw  xmm2, xmm3 // UV
6046    movdqu     xmm0, [eax] // Y
6047    movdqa     xmm1, xmm2
6048    lea        eax, [eax + 16]
6049    punpcklbw  xmm1, xmm0 // UYVY
6050    punpckhbw  xmm2, xmm0
6051    movdqu     [edi], xmm1
6052    movdqu     [edi + 16], xmm2
6053    lea        edi, [edi + 32]
6054    sub        ecx, 16
6055    jg         convertloop
6056
6057    pop        edi
6058    pop        esi
6059    ret
6060  }
6061}
6062
6063#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
6064__declspec(naked)
6065void ARGBPolynomialRow_SSE2(const uint8* src_argb,
6066                            uint8* dst_argb, const float* poly,
6067                            int width) {
6068  __asm {
6069    push       esi
6070    mov        eax, [esp + 4 + 4]   /* src_argb */
6071    mov        edx, [esp + 4 + 8]   /* dst_argb */
6072    mov        esi, [esp + 4 + 12]  /* poly */
6073    mov        ecx, [esp + 4 + 16]  /* width */
6074    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
6075
6076    // 2 pixel loop.
6077 convertloop:
6078//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
6079//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
6080    movq       xmm0, qword ptr [eax]  // BGRABGRA
6081    lea        eax, [eax + 8]
6082    punpcklbw  xmm0, xmm3
6083    movdqa     xmm4, xmm0
6084    punpcklwd  xmm0, xmm3  // pixel 0
6085    punpckhwd  xmm4, xmm3  // pixel 1
6086    cvtdq2ps   xmm0, xmm0  // 4 floats
6087    cvtdq2ps   xmm4, xmm4
6088    movdqa     xmm1, xmm0  // X
6089    movdqa     xmm5, xmm4
6090    mulps      xmm0, [esi + 16]  // C1 * X
6091    mulps      xmm4, [esi + 16]
6092    addps      xmm0, [esi]  // result = C0 + C1 * X
6093    addps      xmm4, [esi]
6094    movdqa     xmm2, xmm1
6095    movdqa     xmm6, xmm5
6096    mulps      xmm2, xmm1  // X * X
6097    mulps      xmm6, xmm5
6098    mulps      xmm1, xmm2  // X * X * X
6099    mulps      xmm5, xmm6
6100    mulps      xmm2, [esi + 32]  // C2 * X * X
6101    mulps      xmm6, [esi + 32]
6102    mulps      xmm1, [esi + 48]  // C3 * X * X * X
6103    mulps      xmm5, [esi + 48]
6104    addps      xmm0, xmm2  // result += C2 * X * X
6105    addps      xmm4, xmm6
6106    addps      xmm0, xmm1  // result += C3 * X * X * X
6107    addps      xmm4, xmm5
6108    cvttps2dq  xmm0, xmm0
6109    cvttps2dq  xmm4, xmm4
6110    packuswb   xmm0, xmm4
6111    packuswb   xmm0, xmm0
6112    movq       qword ptr [edx], xmm0
6113    lea        edx, [edx + 8]
6114    sub        ecx, 2
6115    jg         convertloop
6116    pop        esi
6117    ret
6118  }
6119}
6120#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
6121
6122#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6123__declspec(naked)
6124void ARGBPolynomialRow_AVX2(const uint8* src_argb,
6125                            uint8* dst_argb, const float* poly,
6126                            int width) {
6127  __asm {
6128    mov        eax, [esp + 4]   /* src_argb */
6129    mov        edx, [esp + 8]   /* dst_argb */
6130    mov        ecx, [esp + 12]   /* poly */
6131    vbroadcastf128 ymm4, [ecx]       // C0
6132    vbroadcastf128 ymm5, [ecx + 16]  // C1
6133    vbroadcastf128 ymm6, [ecx + 32]  // C2
6134    vbroadcastf128 ymm7, [ecx + 48]  // C3
6135    mov        ecx, [esp + 16]  /* width */
6136
6137    // 2 pixel loop.
6138 convertloop:
6139    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
6140    lea         eax, [eax + 8]
6141    vcvtdq2ps   ymm0, ymm0        // X 8 floats
6142    vmulps      ymm2, ymm0, ymm0  // X * X
6143    vmulps      ymm3, ymm0, ymm7  // C3 * X
6144    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
6145    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
6146    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
6147    vcvttps2dq  ymm0, ymm0
6148    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
6149    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
6150    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
6151    vmovq       qword ptr [edx], xmm0
6152    lea         edx, [edx + 8]
6153    sub         ecx, 2
6154    jg          convertloop
6155    vzeroupper
6156    ret
6157  }
6158}
6159#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
6160
6161#ifdef HAS_ARGBCOLORTABLEROW_X86
6162// Tranform ARGB pixels with color table.
6163__declspec(naked)
6164void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
6165                           int width) {
6166  __asm {
6167    push       esi
6168    mov        eax, [esp + 4 + 4]   /* dst_argb */
6169    mov        esi, [esp + 4 + 8]   /* table_argb */
6170    mov        ecx, [esp + 4 + 12]  /* width */
6171
6172    // 1 pixel loop.
6173  convertloop:
6174    movzx      edx, byte ptr [eax]
6175    lea        eax, [eax + 4]
6176    movzx      edx, byte ptr [esi + edx * 4]
6177    mov        byte ptr [eax - 4], dl
6178    movzx      edx, byte ptr [eax - 4 + 1]
6179    movzx      edx, byte ptr [esi + edx * 4 + 1]
6180    mov        byte ptr [eax - 4 + 1], dl
6181    movzx      edx, byte ptr [eax - 4 + 2]
6182    movzx      edx, byte ptr [esi + edx * 4 + 2]
6183    mov        byte ptr [eax - 4 + 2], dl
6184    movzx      edx, byte ptr [eax - 4 + 3]
6185    movzx      edx, byte ptr [esi + edx * 4 + 3]
6186    mov        byte ptr [eax - 4 + 3], dl
6187    dec        ecx
6188    jg         convertloop
6189    pop        esi
6190    ret
6191  }
6192}
6193#endif  // HAS_ARGBCOLORTABLEROW_X86
6194
6195#ifdef HAS_RGBCOLORTABLEROW_X86
6196// Tranform RGB pixels with color table.
6197__declspec(naked)
6198void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
6199  __asm {
6200    push       esi
6201    mov        eax, [esp + 4 + 4]   /* dst_argb */
6202    mov        esi, [esp + 4 + 8]   /* table_argb */
6203    mov        ecx, [esp + 4 + 12]  /* width */
6204
6205    // 1 pixel loop.
6206  convertloop:
6207    movzx      edx, byte ptr [eax]
6208    lea        eax, [eax + 4]
6209    movzx      edx, byte ptr [esi + edx * 4]
6210    mov        byte ptr [eax - 4], dl
6211    movzx      edx, byte ptr [eax - 4 + 1]
6212    movzx      edx, byte ptr [esi + edx * 4 + 1]
6213    mov        byte ptr [eax - 4 + 1], dl
6214    movzx      edx, byte ptr [eax - 4 + 2]
6215    movzx      edx, byte ptr [esi + edx * 4 + 2]
6216    mov        byte ptr [eax - 4 + 2], dl
6217    dec        ecx
6218    jg         convertloop
6219
6220    pop        esi
6221    ret
6222  }
6223}
6224#endif  // HAS_RGBCOLORTABLEROW_X86
6225
6226#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6227// Tranform RGB pixels with luma table.
6228__declspec(naked)
6229void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6230                                 int width,
6231                                 const uint8* luma, uint32 lumacoeff) {
6232  __asm {
6233    push       esi
6234    push       edi
6235    mov        eax, [esp + 8 + 4]   /* src_argb */
6236    mov        edi, [esp + 8 + 8]   /* dst_argb */
6237    mov        ecx, [esp + 8 + 12]  /* width */
6238    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
6239    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
6240    pshufd     xmm2, xmm2, 0
6241    pshufd     xmm3, xmm3, 0
6242    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
6243    psllw      xmm4, 8
6244    pxor       xmm5, xmm5
6245
6246    // 4 pixel loop.
6247  convertloop:
6248    movdqu     xmm0, qword ptr [eax]      // generate luma ptr
6249    pmaddubsw  xmm0, xmm3
6250    phaddw     xmm0, xmm0
6251    pand       xmm0, xmm4  // mask out low bits
6252    punpcklwd  xmm0, xmm5
6253    paddd      xmm0, xmm2  // add table base
6254    movd       esi, xmm0
6255    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6256
6257    movzx      edx, byte ptr [eax]
6258    movzx      edx, byte ptr [esi + edx]
6259    mov        byte ptr [edi], dl
6260    movzx      edx, byte ptr [eax + 1]
6261    movzx      edx, byte ptr [esi + edx]
6262    mov        byte ptr [edi + 1], dl
6263    movzx      edx, byte ptr [eax + 2]
6264    movzx      edx, byte ptr [esi + edx]
6265    mov        byte ptr [edi + 2], dl
6266    movzx      edx, byte ptr [eax + 3]  // copy alpha.
6267    mov        byte ptr [edi + 3], dl
6268
6269    movd       esi, xmm0
6270    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6271
6272    movzx      edx, byte ptr [eax + 4]
6273    movzx      edx, byte ptr [esi + edx]
6274    mov        byte ptr [edi + 4], dl
6275    movzx      edx, byte ptr [eax + 5]
6276    movzx      edx, byte ptr [esi + edx]
6277    mov        byte ptr [edi + 5], dl
6278    movzx      edx, byte ptr [eax + 6]
6279    movzx      edx, byte ptr [esi + edx]
6280    mov        byte ptr [edi + 6], dl
6281    movzx      edx, byte ptr [eax + 7]  // copy alpha.
6282    mov        byte ptr [edi + 7], dl
6283
6284    movd       esi, xmm0
6285    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6286
6287    movzx      edx, byte ptr [eax + 8]
6288    movzx      edx, byte ptr [esi + edx]
6289    mov        byte ptr [edi + 8], dl
6290    movzx      edx, byte ptr [eax + 9]
6291    movzx      edx, byte ptr [esi + edx]
6292    mov        byte ptr [edi + 9], dl
6293    movzx      edx, byte ptr [eax + 10]
6294    movzx      edx, byte ptr [esi + edx]
6295    mov        byte ptr [edi + 10], dl
6296    movzx      edx, byte ptr [eax + 11]  // copy alpha.
6297    mov        byte ptr [edi + 11], dl
6298
6299    movd       esi, xmm0
6300
6301    movzx      edx, byte ptr [eax + 12]
6302    movzx      edx, byte ptr [esi + edx]
6303    mov        byte ptr [edi + 12], dl
6304    movzx      edx, byte ptr [eax + 13]
6305    movzx      edx, byte ptr [esi + edx]
6306    mov        byte ptr [edi + 13], dl
6307    movzx      edx, byte ptr [eax + 14]
6308    movzx      edx, byte ptr [esi + edx]
6309    mov        byte ptr [edi + 14], dl
6310    movzx      edx, byte ptr [eax + 15]  // copy alpha.
6311    mov        byte ptr [edi + 15], dl
6312
6313    lea        eax, [eax + 16]
6314    lea        edi, [edi + 16]
6315    sub        ecx, 4
6316    jg         convertloop
6317
6318    pop        edi
6319    pop        esi
6320    ret
6321  }
6322}
6323#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6324
6325#endif  // defined(_M_X64)
6326#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6327
6328#ifdef __cplusplus
6329}  // extern "C"
6330}  // namespace libyuv
6331#endif
6332