// planar_functions.cc revision 7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa
1/*
2 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/planar_functions.h"
12
13#include <string.h>
14
15#include "libyuv/cpu_id.h"
16#include "row.h"
17
18namespace libyuv {
19
20#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
21#define HAS_SPLITUV_NEON
22// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
23// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
24static void SplitUV_NEON(const uint8* src_uv,
25                         uint8* dst_u, uint8* dst_v, int pix) {
26  __asm__ volatile
27  (
28    "1:\n"
29    "vld2.u8    {q0,q1}, [%0]!    \n"  // load 16 pairs of UV
30    "vst1.u8    {q0}, [%1]!       \n"  // store U
31    "vst1.u8    {q1}, [%2]!       \n"  // Store V
32    "subs       %3, %3, #16       \n"  // 16 processed per loop
33    "bhi        1b                \n"
34    : "+r"(src_uv),
35      "+r"(dst_u),
36      "+r"(dst_v),
37      "+r"(pix)             // Output registers
38    :                       // Input registers
39    : "q0", "q1"            // Clobber List
40  );
41}
42
#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// TALIGN16 declares a 16-byte-aligned table for use with aligned SSE loads.
// NOTE(review): the MSVC variant also adds "static" and prefixes the name
// with an underscore, so Windows references see _kShuffleMask* -- confirm
// that row.h uses the matching spelling per compiler.
#if defined(_MSC_VER)
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif

// Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
60
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
// De-interleaves a row of UV pairs: even bytes go to dst_u, odd bytes to
// dst_v, 16 pairs per iteration.  Requires 16-byte aligned pointers (movdqa)
// and pix to be a multiple of 16.  Naked: all prologue/epilogue is manual.
__declspec(naked)
static void SplitUV_SSE2(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm7   // even bytes
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    psrlw      xmm2, 8      // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqa     [edi], xmm2
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         wloop
    pop        edi
    ret
  }
}
97
#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_SPLITUV_SSE2
// GCC inline-asm version of the UV de-interleave: even bytes to dst_u (%1),
// odd bytes to dst_v (%2), 16 pairs per iteration.  Requires 16-byte aligned
// pointers (movdqa) and pix to be a multiple of 16.
static void SplitUV_SSE2(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
 asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"       // xmm7 = 0x00ff00ff.. mask
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "pand       %%xmm7,%%xmm0\n"       // even bytes (U)
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "psrlw      $0x8,%%xmm2\n"         // odd bytes (V)
  "psrlw      $0x8,%%xmm3\n"
  "packuswb   %%xmm3,%%xmm2\n"
  "movdqa     %%xmm2,(%2)\n"
  "lea        0x10(%2),%2\n"
  "sub        $0x10,%3\n"
  "ja         1b\n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory"
);
}
#endif
#endif
134
135static void SplitUV_C(const uint8* src_uv,
136                      uint8* dst_u, uint8* dst_v, int pix) {
137  // Copy a row of UV.
138  for (int x = 0; x < pix; ++x) {
139    dst_u[0] = src_uv[0];
140    dst_v[0] = src_uv[1];
141    src_uv += 2;
142    dst_u += 1;
143    dst_v += 1;
144  }
145}
146
147static void I420CopyPlane(const uint8* src_y, int src_stride_y,
148                          uint8* dst_y, int dst_stride_y,
149                          int width, int height) {
150  // Copy plane
151  for (int y = 0; y < height; ++y) {
152    memcpy(dst_y, src_y, width);
153    src_y += src_stride_y;
154    dst_y += dst_stride_y;
155  }
156}
157
158// Copy I420 with optional flipping
159int I420Copy(const uint8* src_y, int src_stride_y,
160             const uint8* src_u, int src_stride_u,
161             const uint8* src_v, int src_stride_v,
162             uint8* dst_y, int dst_stride_y,
163             uint8* dst_u, int dst_stride_u,
164             uint8* dst_v, int dst_stride_v,
165             int width, int height) {
166  if (!src_y || !src_u || !src_v ||
167      !dst_y || !dst_u || !dst_v ||
168      width <= 0 || height == 0) {
169    return -1;
170  }
171
172  // Negative height means invert the image.
173  if (height < 0) {
174    height = -height;
175    int halfheight = (height + 1) >> 1;
176    src_y = src_y + (height - 1) * src_stride_y;
177    src_u = src_u + (halfheight - 1) * src_stride_u;
178    src_v = src_v + (halfheight - 1) * src_stride_v;
179    src_stride_y = -src_stride_y;
180    src_stride_u = -src_stride_u;
181    src_stride_v = -src_stride_v;
182  }
183
184  int halfwidth = (width + 1) >> 1;
185  int halfheight = (height + 1) >> 1;
186  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
187  I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
188  I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
189  return 0;
190}
191
// SetRows32 writes 'count' bytes using a 32 bit value repeated

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_NEON
// Fills count bytes at dst with v32 repeated, 16 bytes per iteration.
// Requires count to be a multiple of 16 and dst 16-byte aligned.
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
    "1:\n"
    "vst1.u32   {q0}, [%0]!       \n"  // store
    "subs       %1, %1, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
  : "+r"(dst),  // %0
    "+r"(count) // %1
  : "r"(v32)    // %2
  : "q0", "memory"
  );
}
210
#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_SSE2
// Fills count bytes at dst with v32 repeated, 16 bytes per iteration.
// Requires count to be a multiple of 16 and dst 16-byte aligned (movdqa).
__declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  __asm {
    mov        eax, [esp + 4]    // dst
    movd       xmm7, [esp + 8]   // v32
    mov        ecx, [esp + 12]   // count
    pshufd     xmm7, xmm7, 0     // broadcast v32 to all 4 lanes

  wloop:
    movdqa     [eax], xmm7
    lea        eax, [eax + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}
229
#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

#define HAS_SETROW_SSE2
// GCC inline-asm version: fills count bytes at dst with v32 repeated,
// 16 bytes per iteration.  Same alignment/multiple-of-16 requirements.
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  asm volatile(
  "movd       %2, %%xmm7\n"
  "pshufd     $0x0,%%xmm7,%%xmm7\n"  // broadcast v32 to all 4 lanes
"1:"
  "movdqa     %%xmm7,(%0)\n"
  "lea        0x10(%0),%0\n"
  "sub        $0x10,%1\n"
  "ja         1b\n"
  : "+r"(dst),  // %0
    "+r"(count) // %1
  : "r"(v32)    // %2
  : "memory"
);
}
#endif
250
// Portable fallback: fill count bytes with the low byte of v8.  Matches the
// SetRow function-pointer signature used by I420SetPlane.
static void SetRow8_C(uint8* dst, uint32 v8, int count) {
  memset(dst, v8, count);
}
254
// Fill every pixel of a plane with 'value' (low 8 bits).  Picks a SIMD row
// setter when the CPU supports it and width/pointer/stride alignment allow;
// otherwise falls back to memset.
static void I420SetPlane(uint8* dst_y, int dst_stride_y,
                         int width, int height,
                         int value) {
  void (*SetRow)(uint8* dst, uint32 value, int pix);
#if defined(HAS_SETROW_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_NEON;
  } else
#elif defined(HAS_SETROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_SSE2;
  } else
#endif
  {
    SetRow = SetRow8_C;
  }

  // Replicate the byte into all four lanes for the 32-bit SIMD setters.
  uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
  // Set plane
  for (int y = 0; y < height; ++y) {
    SetRow(dst_y, v32, width);
    dst_y += dst_stride_y;
  }
}
283
// Draw a rectangle into I420.
// (x, y) is the top-left corner in luma pixels; the chroma rectangle uses
// halved offsets and dimensions.  value_y/u/v must each be in [0, 255].
// Negative height flips the destination vertically.
// Returns 0 on success, -1 on invalid arguments.
int I420Rect(uint8* dst_y, int dst_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int x, int y,
             int width, int height,
             int value_y, int value_u, int value_v) {
  if (!dst_y || !dst_u || !dst_v ||
      width <= 0 || height == 0 ||
      x < 0 || y < 0 ||
      value_y < 0 || value_y > 255 ||
      value_u < 0 || value_u > 255 ||
      value_v < 0 || value_v > 255) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }

  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  // Offset each plane to the rectangle origin; chroma offsets are halved.
  uint8* start_y = dst_y + y * dst_stride_y + x;
  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);

  I420SetPlane(start_y, dst_stride_y, width, height, value_y);
  I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
  I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
  return 0;
}
322
323// Helper function to copy yuv data without scaling.  Used
324// by our jpeg conversion callbacks to incrementally fill a yuv image.
325int I422ToI420(const uint8* src_y, int src_stride_y,
326               const uint8* src_u, int src_stride_u,
327               const uint8* src_v, int src_stride_v,
328               uint8* dst_y, int dst_stride_y,
329               uint8* dst_u, int dst_stride_u,
330               uint8* dst_v, int dst_stride_v,
331               int width, int height) {
332  // Negative height means invert the image.
333  if (height < 0) {
334    height = -height;
335    src_y = src_y + (height - 1) * src_stride_y;
336    src_u = src_u + (height - 1) * src_stride_u;
337    src_v = src_v + (height - 1) * src_stride_v;
338    src_stride_y = -src_stride_y;
339    src_stride_u = -src_stride_u;
340    src_stride_v = -src_stride_v;
341  }
342
343  // Copy Y plane
344  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
345
346  // SubSample UV planes.
347  int x, y;
348  int halfwidth = (width + 1) >> 1;
349  for (y = 0; y < height; y += 2) {
350    const uint8* u0 = src_u;
351    const uint8* u1 = src_u + src_stride_u;
352    if ((y + 1) >= height) {
353      u1 = u0;
354    }
355    for (x = 0; x < halfwidth; ++x) {
356      dst_u[x] = (u0[x] + u1[x] + 1) >> 1;
357    }
358    src_u += src_stride_u * 2;
359    dst_u += dst_stride_u;
360  }
361  for (y = 0; y < height; y += 2) {
362    const uint8* v0 = src_v;
363    const uint8* v1 = src_v + src_stride_v;
364    if ((y + 1) >= height) {
365      v1 = v0;
366    }
367    for (x = 0; x < halfwidth; ++x) {
368      dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
369    }
370    src_v += src_stride_v * 2;
371    dst_v += dst_stride_v;
372  }
373  return 0;
374}
375
376static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
377                           uint8* dst, int dst_stride_frame,
378                           int width, int height) {
379  // Copy plane
380  for (int y = 0; y < height; y += 2) {
381    memcpy(dst, src, width);
382    src += src_stride_0;
383    dst += dst_stride_frame;
384    memcpy(dst, src, width);
385    src += src_stride_1;
386    dst += dst_stride_frame;
387  }
388}
389
// Support converting from FOURCC_M420
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420.
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
// Chroma is half width / half height. (420)
// src_stride_m420 is row planar.  Normally this will be the width in pixels.
//   The UV plane is half width, but 2 values, so src_stride_m420 applies to
//   this as well as the two Y planes.

// Generic biplanar-to-I420 converter shared by M420 and NV12: copies the Y
// plane (with alternating row strides) and de-interleaves the UV plane.
// Note: a negative height flips by inverting the DESTINATION planes while
// the source is read top-down (I420Copy inverts the source instead).
static int X420ToI420(const uint8* src_y,
                      int src_stride_y0, int src_stride_y1,
                      const uint8* src_uv, int src_stride_uv,
                      uint8* dst_y, int dst_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
                      int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }

  // Pick a SIMD UV splitter when the CPU and all alignments allow it;
  // otherwise use the portable C version.
  int halfwidth = (width + 1) >> 1;
  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
    SplitUV = SplitUV_NEON;
  } else
#elif defined(HAS_SPLITUV_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
    SplitUV = SplitUV_SSE2;
  } else
#endif
  {
    SplitUV = SplitUV_C;
  }

  I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
                 width, height);

  // De-interleave one UV row per chroma output row.
  int halfheight = (height + 1) >> 1;
  for (int y = 0; y < halfheight; ++y) {
    // Copy a row of UV.
    SplitUV(src_uv, dst_u, dst_v, halfwidth);
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
    src_uv += src_stride_uv;
  }
  return 0;
}
454
455// Convert M420 to I420.
456int M420ToI420(const uint8* src_m420, int src_stride_m420,
457               uint8* dst_y, int dst_stride_y,
458               uint8* dst_u, int dst_stride_u,
459               uint8* dst_v, int dst_stride_v,
460               int width, int height) {
461  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
462                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
463                    dst_y, dst_stride_y,
464                    dst_u, dst_stride_u,
465                    dst_v, dst_stride_v,
466                    width, height);
467}
468
469// Convert NV12 to I420.
470int NV12ToI420(const uint8* src_y, int src_stride_y,
471               const uint8* src_uv, int src_stride_uv,
472               uint8* dst_y, int dst_stride_y,
473               uint8* dst_u, int dst_stride_u,
474               uint8* dst_v, int dst_stride_v,
475               int width, int height) {
476  return X420ToI420(src_y, src_stride_y, src_stride_y,
477                    src_uv, src_stride_uv,
478                    dst_y, dst_stride_y,
479                    dst_u, dst_stride_u,
480                    dst_v, dst_stride_v,
481                    width, height);
482}
483
484// Convert NV12 to I420.  Deprecated.
485int NV12ToI420(const uint8* src_y,
486               const uint8* src_uv,
487               int src_stride_frame,
488               uint8* dst_y, int dst_stride_y,
489               uint8* dst_u, int dst_stride_u,
490               uint8* dst_v, int dst_stride_v,
491               int width, int height) {
492  return X420ToI420(src_y, src_stride_frame, src_stride_frame,
493                    src_uv, src_stride_frame,
494                    dst_y, dst_stride_y,
495                    dst_u, dst_stride_u,
496                    dst_v, dst_stride_v,
497                    width, height);
498}
499
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITYUY2_SSE2
// Splits one YUYV row into separate Y, U and V rows, 16 Y samples (8 UV
// pairs) per iteration.  Requires 16-byte aligned src/dst_y, 8-byte aligned
// dst_u/dst_v, and pix a multiple of 16.
__declspec(naked)
static void SplitYUY2_SSE2(const uint8* src_yuy2,
                           uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        edx, [esp + 8 + 8]    // dst_y
    mov        esi, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm2, xmm7   // even bytes are Y
    pand       xmm3, xmm7
    packuswb   xmm2, xmm3
    movdqa     [edx], xmm2
    lea        edx, [edx + 16]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm7  // U
    packuswb   xmm0, xmm0
    movq       qword ptr [esi], xmm0
    lea        esi, [esi + 8]
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
    sub        ecx, 16
    ja         wloop

    pop        edi
    pop        esi
    ret
  }
}
547
#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_SPLITYUY2_SSE2
// GCC inline-asm version of the YUYV splitter: Y to %1, U to %2, V to %3,
// 16 Y samples per iteration.  Same alignment requirements as the MSVC
// version above.
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
                           uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"       // xmm7 = 0x00ff00ff.. mask
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "pand       %%xmm7,%%xmm2\n"       // even bytes are Y
  "pand       %%xmm7,%%xmm3\n"
  "packuswb   %%xmm3,%%xmm2\n"
  "movdqa     %%xmm2,(%1)\n"
  "lea        0x10(%1),%1\n"
  "psrlw      $0x8,%%xmm0\n"         // YUYV -> UVUV
  "psrlw      $0x8,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "pand       %%xmm7,%%xmm0\n"       // U
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%2)\n"
  "lea        0x8(%2),%2\n"
  "psrlw      $0x8,%%xmm1\n"         // V
  "packuswb   %%xmm1,%%xmm1\n"
  "movq       %%xmm1,(%3)\n"
  "lea        0x8(%3),%3\n"
  "sub        $0x10,%4\n"
  "ja         1b\n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_y),       // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(pix)          // %4
  :
  : "memory"
);
}
#endif
591
592static void SplitYUY2_C(const uint8* src_yuy2,
593                        uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
594  // Copy a row of YUY2.
595  for (int x = 0; x < pix; x += 2) {
596    dst_y[0] = src_yuy2[0];
597    dst_y[1] = src_yuy2[2];
598    dst_u[0] = src_yuy2[1];
599    dst_v[0] = src_yuy2[3];
600    src_yuy2 += 4;
601    dst_y += 2;
602    dst_u += 1;
603    dst_v += 1;
604  }
605}
606
// Convert Q420 to I420.
// Format is rows of YY/YUYV
int Q420ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_yuy2, int src_stride_yuy2,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.  The flip is applied to the
  // destination planes (written bottom-up); the source is read top-down.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }
  // Pick the SSE2 splitter when CPU support and all alignments allow it.
  void (*SplitYUY2)(const uint8* src_yuy2,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITYUY2_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    SplitYUY2 = SplitYUY2_SSE2;
  } else
#endif
  {
    SplitYUY2 = SplitYUY2_C;
  }
  // Each iteration consumes one plain Y row plus one YUYV row, producing two
  // Y rows and one U and one V row.  NOTE(review): assumes height is even;
  // an odd height would write one extra Y row -- confirm with callers.
  for (int y = 0; y < height; y += 2) {
    memcpy(dst_y, src_y, width);
    dst_y += dst_stride_y;
    src_y += src_stride_y;

    // Copy a row of YUY2.
    SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
    dst_y += dst_stride_y;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
    src_yuy2 += src_stride_yuy2;
  }
  return 0;
}
655
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_YUY2TOI420ROW_SSE2
// Extracts the Y samples (even bytes) of one YUY2 row, 16 per iteration.
// Requires 16-byte aligned pointers and pix a multiple of 16.
__declspec(naked)
void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
                         uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm7, xmm7        // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm7   // even bytes are Y
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}
682
// Averages the chroma of two YUY2 rows (this row and the one stride_yuy2
// below) and writes de-interleaved U and V rows.
// NOTE(review): the fourth parameter is named dst_y but receives the V
// destination (the asm loads it as dst_v) -- likely a misnomer.
__declspec(naked)
void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
                          uint8* dst_u, uint8* dst_y, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm7  // U
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
    sub        ecx, 16
    ja         wloop

    pop        edi
    pop        esi
    ret
  }
}
725
#define HAS_UYVYTOI420ROW_SSE2
// Extracts the Y samples (odd bytes) of one UYVY row, 16 per iteration.
// Requires 16-byte aligned pointers and pix a multiple of 16.
__declspec(naked)
void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
                         uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}
749
// Averages the chroma of two UYVY rows and writes de-interleaved U and V
// rows.  NOTE(review): the fourth parameter is named dst_y but receives the
// V destination (the asm loads it as dst_v) -- likely a misnomer.
__declspec(naked)
void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
                          uint8* dst_u, uint8* dst_y, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    pand       xmm0, xmm7   // UYVY -> UVUV
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm7  // U
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
    sub        ecx, 16
    ja         wloop

    pop        edi
    pop        esi
    ret
  }
}
792
#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

#define HAS_YUY2TOI420ROW_SSE2
// GCC inline-asm: extracts the Y samples (even bytes) of one YUY2 row,
// 16 per iteration.  Requires 16-byte aligned pointers, pix multiple of 16.
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"       // xmm7 = 0x00ff00ff.. mask
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"       // even bytes are Y
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory"
);
}
820
// GCC inline-asm: averages the chroma of two YUY2 rows and writes
// de-interleaved U and V rows.  NOTE(review): the dst_y parameter (%2)
// receives the V destination -- likely a misnomer.
static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_y, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"       // xmm7 = 0x00ff00ff.. mask
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%4,1),%%xmm2\n"    // row below, same columns
  "movdqa     0x10(%0,%4,1),%%xmm3\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm2,%%xmm0\n"       // average the two rows
  "pavgb      %%xmm3,%%xmm1\n"
  "psrlw      $0x8,%%xmm0\n"         // YUYV -> UVUV
  "psrlw      $0x8,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "pand       %%xmm7,%%xmm0\n"       // U
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "psrlw      $0x8,%%xmm1\n"         // V
  "packuswb   %%xmm1,%%xmm1\n"
  "movq       %%xmm1,(%2)\n"
  "lea        0x8(%2),%2\n"
  "sub        $0x10,%3\n"
  "ja         1b\n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_y),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory"
);
}
#define HAS_UYVYTOI420ROW_SSE2
// GCC inline-asm: extracts the Y samples (odd bytes) of one UYVY row,
// 16 per iteration.  Requires 16-byte aligned pointers, pix multiple of 16.
static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
  asm volatile(
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "psrlw      $0x8,%%xmm0\n"         // odd bytes are Y
  "psrlw      $0x8,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory"
);
}
878
// GCC inline-asm: averages the chroma of two UYVY rows and writes
// de-interleaved U and V rows.  NOTE(review): the dst_y parameter (%2)
// receives the V destination -- likely a misnomer.
static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_y, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"       // xmm7 = 0x00ff00ff.. mask
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%4,1),%%xmm2\n"    // row below, same columns
  "movdqa     0x10(%0,%4,1),%%xmm3\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm2,%%xmm0\n"       // average the two rows
  "pavgb      %%xmm3,%%xmm1\n"
  "pand       %%xmm7,%%xmm0\n"       // UYVY -> UVUV
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "pand       %%xmm7,%%xmm0\n"       // U
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "psrlw      $0x8,%%xmm1\n"         // V
  "packuswb   %%xmm1,%%xmm1\n"
  "movq       %%xmm1,(%2)\n"
  "lea        0x8(%2),%2\n"
  "sub        $0x10,%3\n"
  "ja         1b\n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_y),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory"
);
}
#endif
915
916// Filter 2 rows of YUY2 UV's (422) into U and V (420)
917void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
918                       uint8* dst_u, uint8* dst_v, int pix) {
919  // Output a row of UV values, filtering 2 rows of YUY2
920  for (int x = 0; x < pix; x += 2) {
921    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
922    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
923    src_yuy2 += 4;
924    dst_u += 1;
925    dst_v += 1;
926  }
927}
928
929void YUY2ToI420RowY_C(const uint8* src_yuy2,
930                      uint8* dst_y, int pix) {
931  // Copy a row of yuy2 Y values
932  for (int x = 0; x < pix; ++x) {
933    dst_y[0] = src_yuy2[0];
934    src_yuy2 += 2;
935    dst_y += 1;
936  }
937}
938
939void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
940                       uint8* dst_u, uint8* dst_v, int pix) {
941  // Copy a row of uyvy UV values
942  for (int x = 0; x < pix; x += 2) {
943    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
944    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
945    src_uyvy += 4;
946    dst_u += 1;
947    dst_v += 1;
948  }
949}
950
951void UYVYToI420RowY_C(const uint8* src_uyvy,
952                      uint8* dst_y, int pix) {
953  // Copy a row of uyvy Y values
954  for (int x = 0; x < pix; ++x) {
955    dst_y[0] = src_uyvy[1];
956    src_uyvy += 2;
957    dst_y += 1;
958  }
959}
960
// Convert YUY2 to I420.
// Copies Y every row; averages the chroma of each row pair into one U and
// one V row.  A negative height flips the source vertically.
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
    src_stride_yuy2 = -src_stride_yuy2;
  }
  // Pick SSE2 row functions when CPU support and all alignments allow.
  void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix);
  void (*YUY2ToI420RowY)(const uint8* src_yuy2,
                         uint8* dst_y, int pix);
#if defined(HAS_YUY2TOI420ROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
    YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
  } else
#endif
  {
    YUY2ToI420RowY = YUY2ToI420RowY_C;
    YUY2ToI420RowUV = YUY2ToI420RowUV_C;
  }
  for (int y = 0; y < height; ++y) {
    if ((y & 1) == 0) {
      if (y >= (height - 1) ) {  // last chroma on odd height clamp height
        // Zeroing the stride makes the UV filter average the last row with
        // itself; it also stops src_yuy2 advancing below, but this only
        // happens on the final row.
        src_stride_yuy2 = 0;
      }
      YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
      dst_u += dst_stride_u;
      dst_v += dst_stride_v;
    }
    YUY2ToI420RowY(src_yuy2, dst_y, width);
    dst_y += dst_stride_y;
    src_yuy2 += src_stride_yuy2;
  }
  return 0;
}
1007
1008// Convert UYVY to I420.
1009int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
1010               uint8* dst_y, int dst_stride_y,
1011               uint8* dst_u, int dst_stride_u,
1012               uint8* dst_v, int dst_stride_v,
1013               int width, int height) {
1014  // Negative height means invert the image.
1015  if (height < 0) {
1016    height = -height;
1017    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
1018    src_stride_uyvy = -src_stride_uyvy;
1019  }
1020  void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
1021                          uint8* dst_u, uint8* dst_v, int pix);
1022  void (*UYVYToI420RowY)(const uint8* src_uyvy,
1023                         uint8* dst_y, int pix);
1024#if defined(HAS_UYVYTOI420ROW_SSE2)
1025  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
1026      (width % 16 == 0) &&
1027      IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
1028      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
1029      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
1030      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
1031    UYVYToI420RowY = UYVYToI420RowY_SSE2;
1032    UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
1033  } else
1034#endif
1035  {
1036    UYVYToI420RowY = UYVYToI420RowY_C;
1037    UYVYToI420RowUV = UYVYToI420RowUV_C;
1038  }
1039  for (int y = 0; y < height; ++y) {
1040    if ((y & 1) == 0) {
1041      if (y >= (height - 1) ) {  // last chroma on odd height clamp height
1042        src_stride_uyvy = 0;
1043      }
1044      UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
1045      dst_u += dst_stride_u;
1046      dst_v += dst_stride_v;
1047    }
1048    UYVYToI420RowY(src_uyvy, dst_y, width);
1049    dst_y += dst_stride_y;
1050    src_uyvy += src_stride_uyvy;
1051  }
1052  return 0;
1053}
1054
1055// Convert I420 to ARGB.
1056// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
1057int I420ToARGB(const uint8* src_y, int src_stride_y,
1058               const uint8* src_u, int src_stride_u,
1059               const uint8* src_v, int src_stride_v,
1060               uint8* dst_argb, int dst_stride_argb,
1061               int width, int height) {
1062  // Negative height means invert the image.
1063  if (height < 0) {
1064    height = -height;
1065    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
1066    dst_stride_argb = -dst_stride_argb;
1067  }
1068  for (int y = 0; y < height; ++y) {
1069    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
1070    dst_argb += dst_stride_argb;
1071    src_y += src_stride_y;
1072    if (y & 1) {
1073      src_u += src_stride_u;
1074      src_v += src_stride_v;
1075    }
1076  }
1077  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
1078  EMMS();
1079  return 0;
1080}
1081
1082// Convert I420 to BGRA.
1083int I420ToBGRA(const uint8* src_y, int src_stride_y,
1084               const uint8* src_u, int src_stride_u,
1085               const uint8* src_v, int src_stride_v,
1086               uint8* dst_argb, int dst_stride_argb,
1087               int width, int height) {
1088  // Negative height means invert the image.
1089  if (height < 0) {
1090    height = -height;
1091    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
1092    dst_stride_argb = -dst_stride_argb;
1093  }
1094  for (int y = 0; y < height; ++y) {
1095    FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
1096    dst_argb += dst_stride_argb;
1097    src_y += src_stride_y;
1098    if (y & 1) {
1099      src_u += src_stride_u;
1100      src_v += src_stride_v;
1101    }
1102  }
1103  EMMS();
1104  return 0;
1105}
1106
1107// Convert I420 to BGRA.
1108int I420ToABGR(const uint8* src_y, int src_stride_y,
1109               const uint8* src_u, int src_stride_u,
1110               const uint8* src_v, int src_stride_v,
1111               uint8* dst_argb, int dst_stride_argb,
1112               int width, int height) {
1113  // Negative height means invert the image.
1114  if (height < 0) {
1115    height = -height;
1116    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
1117    dst_stride_argb = -dst_stride_argb;
1118  }
1119  for (int y = 0; y < height; ++y) {
1120    FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
1121    dst_argb += dst_stride_argb;
1122    src_y += src_stride_y;
1123    if (y & 1) {
1124      src_u += src_stride_u;
1125      src_v += src_stride_v;
1126    }
1127  }
1128  EMMS();
1129  return 0;
1130}
1131
1132// Convert I422 to ARGB.
1133int I422ToARGB(const uint8* src_y, int src_stride_y,
1134               const uint8* src_u, int src_stride_u,
1135               const uint8* src_v, int src_stride_v,
1136               uint8* dst_argb, int dst_stride_argb,
1137               int width, int height) {
1138  // Negative height means invert the image.
1139  if (height < 0) {
1140    height = -height;
1141    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
1142    dst_stride_argb = -dst_stride_argb;
1143  }
1144  for (int y = 0; y < height; ++y) {
1145    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
1146    dst_argb += dst_stride_argb;
1147    src_y += src_stride_y;
1148    src_u += src_stride_u;
1149    src_v += src_stride_v;
1150  }
1151  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
1152  EMMS();
1153  return 0;
1154}
1155
1156// Convert I444 to ARGB.
1157int I444ToARGB(const uint8* src_y, int src_stride_y,
1158               const uint8* src_u, int src_stride_u,
1159               const uint8* src_v, int src_stride_v,
1160               uint8* dst_argb, int dst_stride_argb,
1161               int width, int height) {
1162  // Negative height means invert the image.
1163  if (height < 0) {
1164    height = -height;
1165    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
1166    dst_stride_argb = -dst_stride_argb;
1167  }
1168  for (int y = 0; y < height; ++y) {
1169    FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
1170    dst_argb += dst_stride_argb;
1171    src_y += src_stride_y;
1172    src_u += src_stride_u;
1173    src_v += src_stride_v;
1174  }
1175  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
1176  EMMS();
1177  return 0;
1178}
1179
1180// Convert I400 to ARGB.
1181int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
1182                         uint8* dst_argb, int dst_stride_argb,
1183                         int width, int height) {
1184  // Negative height means invert the image.
1185  if (height < 0) {
1186    height = -height;
1187    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
1188    dst_stride_argb = -dst_stride_argb;
1189  }
1190  for (int y = 0; y < height; ++y) {
1191    FastConvertYToRGB32Row(src_y, dst_argb, width);
1192    dst_argb += dst_stride_argb;
1193    src_y += src_stride_y;
1194  }
1195  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
1196  EMMS();
1197  return 0;
1198}
1199
1200// TODO(fbarchard): 64 bit version
1201#if defined(WIN32) && !defined(COVERAGE_ENABLED)
1202
#define HAS_I400TOARGBROW_SSE2
// Expand a row of Y (grey) bytes into ARGB pixels with B=G=R=Y and A=0xff.
// Processes 8 pixels per loop; the caller (I400ToARGB) guarantees pix is a
// multiple of 8 and dst_argb is 16-byte aligned for the movdqa stores.
__declspec(naked)
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]        // src_y
    mov        edx, [esp + 8]        // dst_argb
    mov        ecx, [esp + 12]       // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0xff000000
    pslld      xmm7, 24

  wloop:
    movq       xmm0, qword ptr [eax] // read 8 Y bytes
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0            // duplicate each Y: YY
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0            // YYYY dwords for first 4 pixels
    punpckhwd  xmm1, xmm1            // YYYY dwords for last 4 pixels
    por        xmm0, xmm7            // force alpha byte to 0xff
    por        xmm1, xmm7
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8                // 8 pixels per iteration
    ja         wloop
    ret
  }
}
1230
#define HAS_ABGRTOARGBROW_SSSE3
// Reorder ABGR bytes into ARGB via pshufb with kShuffleMaskABGRToARGB.
// Processes 4 pixels (16 bytes) per loop; the caller (ABGRToARGB)
// guarantees pix is a multiple of 4 and both pointers are 16-byte aligned.
__declspec(naked)
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
                                int pix) {
__asm {
    mov       eax, [esp + 4]   // src_abgr
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm7, _kShuffleMaskABGRToARGB  // byte-swizzle table

 convertloop :
    movdqa    xmm0, [eax]      // 4 ABGR pixels
    lea       eax, [eax + 16]
    pshufb    xmm0, xmm7       // shuffle to ARGB order
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 4           // 4 pixels per iteration
    ja        convertloop
    ret
  }
}
1252
#define HAS_BGRATOARGBROW_SSSE3
// Reorder BGRA bytes into ARGB via pshufb with kShuffleMaskBGRAToARGB.
// Processes 4 pixels (16 bytes) per loop; the caller (BGRAToARGB)
// guarantees pix is a multiple of 4 and both pointers are 16-byte aligned.
__declspec(naked)
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
                                int pix) {
__asm {
    mov       eax, [esp + 4]   // src_bgra
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm7, _kShuffleMaskBGRAToARGB  // byte-swizzle table

 convertloop :
    movdqa    xmm0, [eax]      // 4 BGRA pixels
    lea       eax, [eax + 16]
    pshufb    xmm0, xmm7       // shuffle to ARGB order
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 4           // 4 pixels per iteration
    ja        convertloop
    ret
  }
}
1274
1275
1276#elif (defined(__x86_64__) || defined(__i386__)) && \
1277    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
1278
1279// TODO(yuche): consider moving ARGB related codes to a separate file.
#define HAS_I400TOARGBROW_SSE2
// Expand a row of Y (grey) bytes into ARGB pixels with B=G=R=Y and A=0xff.
// Processes 8 pixels per loop; the caller (I400ToARGB) guarantees pix is a
// multiple of 8 and dst_argb is 16-byte aligned for the movdqa stores.
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"   // generate mask 0xff000000
  "pslld      $0x18,%%xmm7\n"
"1:"
  "movq       (%0),%%xmm0\n"     // read 8 Y bytes
  "lea        0x8(%0),%0\n"
  "punpcklbw  %%xmm0,%%xmm0\n"   // duplicate each Y: YY
  "movdqa     %%xmm0,%%xmm1\n"
  "punpcklwd  %%xmm0,%%xmm0\n"   // YYYY dwords for first 4 pixels
  "punpckhwd  %%xmm1,%%xmm1\n"   // YYYY dwords for last 4 pixels
  "por        %%xmm7,%%xmm0\n"   // force alpha byte to 0xff
  "por        %%xmm7,%%xmm1\n"
  "movdqa     %%xmm0,(%1)\n"
  "movdqa     %%xmm1,0x10(%1)\n"
  "lea        0x20(%1),%1\n"
  "sub        $0x8,%2\n"         // 8 pixels per iteration
  "ja         1b\n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory"
);
}
1306
#define HAS_ABGRTOARGBROW_SSSE3
// Reorder ABGR bytes into ARGB via pshufb with kShuffleMaskABGRToARGB.
// Processes 4 pixels (16 bytes) per loop; the caller (ABGRToARGB)
// guarantees pix is a multiple of 4 and both pointers are 16-byte aligned.
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
                                int pix) {
  asm volatile(
  "movdqa     (%3),%%xmm7\n"   // load byte-swizzle table
"1:"
  "movdqa     (%0),%%xmm0\n"   // 4 ABGR pixels
  "lea        0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n" // shuffle to ARGB order
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x4,%2\n"       // 4 pixels per iteration
  "ja         1b\n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskABGRToARGB)  // %3
  : "memory"
);
}
1327
#define HAS_BGRATOARGBROW_SSSE3
// Reorder BGRA bytes into ARGB via pshufb with kShuffleMaskBGRAToARGB.
// Processes 4 pixels (16 bytes) per loop; the caller (BGRAToARGB)
// guarantees pix is a multiple of 4 and both pointers are 16-byte aligned.
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
                                int pix) {
  asm volatile(
  "movdqa     (%3),%%xmm7\n"   // load byte-swizzle table
"1:"
  "movdqa     (%0),%%xmm0\n"   // 4 BGRA pixels
  "lea        0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n" // shuffle to ARGB order
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x4,%2\n"       // 4 pixels per iteration
  "ja         1b\n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskBGRAToARGB)  // %3
  : "memory"
);
}
1348
1349#endif
1350
1351static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
1352  // Copy a Y to RGB.
1353  for (int x = 0; x < pix; ++x) {
1354    uint8 y = src_y[0];
1355    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
1356    dst_argb[3] = 255u;
1357    dst_argb += 4;
1358    ++src_y;
1359  }
1360}
1361
1362// Convert I400 to ARGB.
1363int I400ToARGB(const uint8* src_y, int src_stride_y,
1364               uint8* dst_argb, int dst_stride_argb,
1365               int width, int height) {
1366  if (height < 0) {
1367    height = -height;
1368    src_y = src_y + (height - 1) * src_stride_y;
1369    src_stride_y = -src_stride_y;
1370  }
1371  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
1372#if defined(HAS_I400TOARGBROW_SSE2)
1373  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
1374      (width % 8 == 0) &&
1375      IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
1376      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
1377    I400ToARGBRow = I400ToARGBRow_SSE2;
1378  } else
1379#endif
1380  {
1381    I400ToARGBRow = I400ToARGBRow_C;
1382  }
1383
1384  for (int y = 0; y < height; ++y) {
1385    I400ToARGBRow(src_y, dst_argb, width);
1386    src_y += src_stride_y;
1387    dst_argb += dst_stride_argb;
1388  }
1389  return 0;
1390}
1391
1392static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
1393  for (int x = 0; x < pix; ++x) {
1394    // To support in-place conversion.
1395    uint8 r = src_abgr[0];
1396    uint8 g = src_abgr[1];
1397    uint8 b = src_abgr[2];
1398    uint8 a = src_abgr[3];
1399    dst_argb[0] = b;
1400    dst_argb[1] = g;
1401    dst_argb[2] = r;
1402    dst_argb[3] = a;
1403    dst_argb += 4;
1404    src_abgr += 4;
1405  }
1406}
1407
1408int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
1409               uint8* dst_argb, int dst_stride_argb,
1410               int width, int height) {
1411  if (height < 0) {
1412    height = -height;
1413    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
1414    src_stride_abgr = -src_stride_abgr;
1415  }
1416void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
1417#if defined(HAS_ABGRTOARGBROW_SSSE3)
1418  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
1419      (width % 4 == 0) &&
1420      IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
1421      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
1422    ABGRToARGBRow = ABGRToARGBRow_SSSE3;
1423  } else
1424#endif
1425  {
1426    ABGRToARGBRow = ABGRToARGBRow_C;
1427  }
1428
1429  for (int y = 0; y < height; ++y) {
1430    ABGRToARGBRow(src_abgr, dst_argb, width);
1431    src_abgr += src_stride_abgr;
1432    dst_argb += dst_stride_argb;
1433  }
1434  return 0;
1435}
1436
1437static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
1438  for (int x = 0; x < pix; ++x) {
1439    // To support in-place conversion.
1440    uint8 a = src_bgra[0];
1441    uint8 r = src_bgra[1];
1442    uint8 g = src_bgra[2];
1443    uint8 b = src_bgra[3];
1444    dst_argb[0] = b;
1445    dst_argb[1] = g;
1446    dst_argb[2] = r;
1447    dst_argb[3] = a;
1448    dst_argb += 4;
1449    src_bgra += 4;
1450  }
1451}
1452
1453// Convert BGRA to ARGB.
1454int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
1455               uint8* dst_argb, int dst_stride_argb,
1456               int width, int height) {
1457  if (height < 0) {
1458    height = -height;
1459    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
1460    src_stride_bgra = -src_stride_bgra;
1461  }
1462  void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
1463#if defined(HAS_BGRATOARGBROW_SSSE3)
1464  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
1465      (width % 4 == 0) &&
1466      IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
1467      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
1468    BGRAToARGBRow = BGRAToARGBRow_SSSE3;
1469  } else
1470#endif
1471  {
1472    BGRAToARGBRow = BGRAToARGBRow_C;
1473  }
1474
1475  for (int y = 0; y < height; ++y) {
1476    BGRAToARGBRow(src_bgra, dst_argb, width);
1477    src_bgra += src_stride_bgra;
1478    dst_argb += dst_stride_argb;
1479  }
1480  return 0;
1481}
1482
1483// Convert ARGB to I400.
1484int ARGBToI400(const uint8* src_argb, int src_stride_argb,
1485               uint8* dst_y, int dst_stride_y,
1486               int width, int height) {
1487  if (height < 0) {
1488    height = -height;
1489    src_argb = src_argb + (height - 1) * src_stride_argb;
1490    src_stride_argb = -src_stride_argb;
1491  }
1492void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
1493#if defined(HAS_ARGBTOYROW_SSSE3)
1494  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
1495      (width % 4 == 0) &&
1496      IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
1497      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
1498    ARGBToYRow = ARGBToYRow_SSSE3;
1499  } else
1500#endif
1501  {
1502    ARGBToYRow = ARGBToYRow_C;
1503  }
1504
1505  for (int y = 0; y < height; ++y) {
1506    ARGBToYRow(src_argb, dst_y, width);
1507    src_argb += src_stride_argb;
1508    dst_y += dst_stride_y;
1509  }
1510  return 0;
1511}
1512
1513
1514// Convert RAW to ARGB.
1515int RAWToARGB(const uint8* src_raw, int src_stride_raw,
1516              uint8* dst_argb, int dst_stride_argb,
1517              int width, int height) {
1518  if (height < 0) {
1519    height = -height;
1520    src_raw = src_raw + (height - 1) * src_stride_raw;
1521    src_stride_raw = -src_stride_raw;
1522  }
1523  void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
1524#if defined(HAS_RAWTOARGBROW_SSSE3)
1525  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
1526      (width % 16 == 0) &&
1527      IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
1528      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
1529    RAWToARGBRow = RAWToARGBRow_SSSE3;
1530  } else
1531#endif
1532  {
1533    RAWToARGBRow = RAWToARGBRow_C;
1534  }
1535
1536  for (int y = 0; y < height; ++y) {
1537    RAWToARGBRow(src_raw, dst_argb, width);
1538    src_raw += src_stride_raw;
1539    dst_argb += dst_stride_argb;
1540  }
1541  return 0;
1542}
1543
1544// Convert BG24 to ARGB.
1545int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
1546               uint8* dst_argb, int dst_stride_argb,
1547               int width, int height) {
1548  if (height < 0) {
1549    height = -height;
1550    src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
1551    src_stride_bg24 = -src_stride_bg24;
1552  }
1553  void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
1554#if defined(HAS_BG24TOARGBROW_SSSE3)
1555  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
1556      (width % 16 == 0) &&
1557      IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
1558      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
1559    BG24ToARGBRow = BG24ToARGBRow_SSSE3;
1560  } else
1561#endif
1562  {
1563    BG24ToARGBRow = BG24ToARGBRow_C;
1564  }
1565
1566  for (int y = 0; y < height; ++y) {
1567    BG24ToARGBRow(src_bg24, dst_argb, width);
1568    src_bg24 += src_stride_bg24;
1569    dst_argb += dst_stride_argb;
1570  }
1571  return 0;
1572}
1573
1574}  // namespace libyuv
1575
1576