row_common.cc revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#include <string.h>  // For memcpy and memset.
14
15#include "libyuv/basic_types.h"
16
17#ifdef __cplusplus
18namespace libyuv {
19extern "C" {
20#endif
21
22// llvm x86 is poor at ternary operator, so use branchless min/max.
23
24#define USE_BRANCHLESS 1
25#if USE_BRANCHLESS
26static __inline int32 clamp0(int32 v) {
27  return ((-(v) >> 31) & (v));
28}
29
30static __inline int32 clamp255(int32 v) {
31  return (((255 - (v)) >> 31) | (v)) & 255;
32}
33
34static __inline uint32 Clamp(int32 val) {
35  int v = clamp0(val);
36  return (uint32)(clamp255(v));
37}
38
39static __inline uint32 Abs(int32 v) {
40  int m = v >> 31;
41  return (v + m) ^ m;
42}
43#else  // USE_BRANCHLESS
44static __inline int32 clamp0(int32 v) {
45  return (v < 0) ? 0 : v;
46}
47
48static __inline int32 clamp255(int32 v) {
49  return (v > 255) ? 255 : v;
50}
51
52static __inline uint32 Clamp(int32 val) {
53  int v = clamp0(val);
54  return (uint32)(clamp255(v));
55}
56
57static __inline uint32 Abs(int32 v) {
58  return (v < 0) ? -v : v;
59}
60#endif  // USE_BRANCHLESS
61
62#ifdef LIBYUV_LITTLE_ENDIAN
63#define WRITEWORD(p, v) *(uint32*)(p) = v
64#else
65static inline void WRITEWORD(uint8* p, uint32 v) {
66  p[0] = (uint8)(v & 255);
67  p[1] = (uint8)((v >> 8) & 255);
68  p[2] = (uint8)((v >> 16) & 255);
69  p[3] = (uint8)((v >> 24) & 255);
70}
71#endif
72
73void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
74  int x;
75  for (x = 0; x < width; ++x) {
76    uint8 b = src_rgb24[0];
77    uint8 g = src_rgb24[1];
78    uint8 r = src_rgb24[2];
79    dst_argb[0] = b;
80    dst_argb[1] = g;
81    dst_argb[2] = r;
82    dst_argb[3] = 255u;
83    dst_argb += 4;
84    src_rgb24 += 3;
85  }
86}
87
88void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
89  int x;
90  for (x = 0; x < width; ++x) {
91    uint8 r = src_raw[0];
92    uint8 g = src_raw[1];
93    uint8 b = src_raw[2];
94    dst_argb[0] = b;
95    dst_argb[1] = g;
96    dst_argb[2] = r;
97    dst_argb[3] = 255u;
98    dst_argb += 4;
99    src_raw += 3;
100  }
101}
102
103void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
104  int x;
105  for (x = 0; x < width; ++x) {
106    uint8 b = src_rgb565[0] & 0x1f;
107    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
108    uint8 r = src_rgb565[1] >> 3;
109    dst_argb[0] = (b << 3) | (b >> 2);
110    dst_argb[1] = (g << 2) | (g >> 4);
111    dst_argb[2] = (r << 3) | (r >> 2);
112    dst_argb[3] = 255u;
113    dst_argb += 4;
114    src_rgb565 += 2;
115  }
116}
117
118void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
119                         int width) {
120  int x;
121  for (x = 0; x < width; ++x) {
122    uint8 b = src_argb1555[0] & 0x1f;
123    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
124    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
125    uint8 a = src_argb1555[1] >> 7;
126    dst_argb[0] = (b << 3) | (b >> 2);
127    dst_argb[1] = (g << 3) | (g >> 2);
128    dst_argb[2] = (r << 3) | (r >> 2);
129    dst_argb[3] = -a;
130    dst_argb += 4;
131    src_argb1555 += 2;
132  }
133}
134
135void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
136                         int width) {
137  int x;
138  for (x = 0; x < width; ++x) {
139    uint8 b = src_argb4444[0] & 0x0f;
140    uint8 g = src_argb4444[0] >> 4;
141    uint8 r = src_argb4444[1] & 0x0f;
142    uint8 a = src_argb4444[1] >> 4;
143    dst_argb[0] = (b << 4) | b;
144    dst_argb[1] = (g << 4) | g;
145    dst_argb[2] = (r << 4) | r;
146    dst_argb[3] = (a << 4) | a;
147    dst_argb += 4;
148    src_argb4444 += 2;
149  }
150}
151
152void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
153  int x;
154  for (x = 0; x < width; ++x) {
155    uint8 b = src_argb[0];
156    uint8 g = src_argb[1];
157    uint8 r = src_argb[2];
158    dst_rgb[0] = b;
159    dst_rgb[1] = g;
160    dst_rgb[2] = r;
161    dst_rgb += 3;
162    src_argb += 4;
163  }
164}
165
166void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
167  int x;
168  for (x = 0; x < width; ++x) {
169    uint8 b = src_argb[0];
170    uint8 g = src_argb[1];
171    uint8 r = src_argb[2];
172    dst_rgb[0] = r;
173    dst_rgb[1] = g;
174    dst_rgb[2] = b;
175    dst_rgb += 3;
176    src_argb += 4;
177  }
178}
179
180void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
181  int x;
182  for (x = 0; x < width - 1; x += 2) {
183    uint8 b0 = src_argb[0] >> 3;
184    uint8 g0 = src_argb[1] >> 2;
185    uint8 r0 = src_argb[2] >> 3;
186    uint8 b1 = src_argb[4] >> 3;
187    uint8 g1 = src_argb[5] >> 2;
188    uint8 r1 = src_argb[6] >> 3;
189    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
190              (b1 << 16) | (g1 << 21) | (r1 << 27));
191    dst_rgb += 4;
192    src_argb += 8;
193  }
194  if (width & 1) {
195    uint8 b0 = src_argb[0] >> 3;
196    uint8 g0 = src_argb[1] >> 2;
197    uint8 r0 = src_argb[2] >> 3;
198    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
199  }
200}
201
202// dither4 is a row of 4 values from 4x4 dither matrix.
203// The 4x4 matrix contains values to increase RGB.  When converting to
204// fewer bits (565) this provides an ordered dither.
205// The order in the 4x4 matrix in first byte is upper left.
206// The 4 values are passed as an int, then referenced as an array, so
207// endian will not affect order of the original matrix.  But the dither4
208// will containing the first pixel in the lower byte for little endian
209// or the upper byte for big endian.
210void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
211                             const uint32 dither4, int width) {
212  int x;
213  for (x = 0; x < width - 1; x += 2) {
214    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
215    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
216    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
217    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
218    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
219    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
220    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
221    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
222    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
223              (b1 << 16) | (g1 << 21) | (r1 << 27));
224    dst_rgb += 4;
225    src_argb += 8;
226  }
227  if (width & 1) {
228    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
229    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
230    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
231    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
232    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
233  }
234}
235
236void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
237  int x;
238  for (x = 0; x < width - 1; x += 2) {
239    uint8 b0 = src_argb[0] >> 3;
240    uint8 g0 = src_argb[1] >> 3;
241    uint8 r0 = src_argb[2] >> 3;
242    uint8 a0 = src_argb[3] >> 7;
243    uint8 b1 = src_argb[4] >> 3;
244    uint8 g1 = src_argb[5] >> 3;
245    uint8 r1 = src_argb[6] >> 3;
246    uint8 a1 = src_argb[7] >> 7;
247    *(uint32*)(dst_rgb) =
248        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
249        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
250    dst_rgb += 4;
251    src_argb += 8;
252  }
253  if (width & 1) {
254    uint8 b0 = src_argb[0] >> 3;
255    uint8 g0 = src_argb[1] >> 3;
256    uint8 r0 = src_argb[2] >> 3;
257    uint8 a0 = src_argb[3] >> 7;
258    *(uint16*)(dst_rgb) =
259        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
260  }
261}
262
263void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
264  int x;
265  for (x = 0; x < width - 1; x += 2) {
266    uint8 b0 = src_argb[0] >> 4;
267    uint8 g0 = src_argb[1] >> 4;
268    uint8 r0 = src_argb[2] >> 4;
269    uint8 a0 = src_argb[3] >> 4;
270    uint8 b1 = src_argb[4] >> 4;
271    uint8 g1 = src_argb[5] >> 4;
272    uint8 r1 = src_argb[6] >> 4;
273    uint8 a1 = src_argb[7] >> 4;
274    *(uint32*)(dst_rgb) =
275        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
276        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
277    dst_rgb += 4;
278    src_argb += 8;
279  }
280  if (width & 1) {
281    uint8 b0 = src_argb[0] >> 4;
282    uint8 g0 = src_argb[1] >> 4;
283    uint8 r0 = src_argb[2] >> 4;
284    uint8 a0 = src_argb[3] >> 4;
285    *(uint16*)(dst_rgb) =
286        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
287  }
288}
289
290static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
291  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
292}
293
294static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
295  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
296}
297static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
298  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
299}
300
301#define MAKEROWY(NAME, R, G, B, BPP) \
302void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
303  int x;                                                                       \
304  for (x = 0; x < width; ++x) {                                                \
305    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
306    src_argb0 += BPP;                                                          \
307    dst_y += 1;                                                                \
308  }                                                                            \
309}                                                                              \
310void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
311                       uint8* dst_u, uint8* dst_v, int width) {                \
312  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
313  int x;                                                                       \
314  for (x = 0; x < width - 1; x += 2) {                                         \
315    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
316               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
317    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
318               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
319    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
320               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
321    dst_u[0] = RGBToU(ar, ag, ab);                                             \
322    dst_v[0] = RGBToV(ar, ag, ab);                                             \
323    src_rgb0 += BPP * 2;                                                       \
324    src_rgb1 += BPP * 2;                                                       \
325    dst_u += 1;                                                                \
326    dst_v += 1;                                                                \
327  }                                                                            \
328  if (width & 1) {                                                             \
329    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
330    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
331    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
332    dst_u[0] = RGBToU(ar, ag, ab);                                             \
333    dst_v[0] = RGBToV(ar, ag, ab);                                             \
334  }                                                                            \
335}
336
337MAKEROWY(ARGB, 2, 1, 0, 4)
338MAKEROWY(BGRA, 1, 2, 3, 4)
339MAKEROWY(ABGR, 0, 1, 2, 4)
340MAKEROWY(RGBA, 3, 2, 1, 4)
341MAKEROWY(RGB24, 2, 1, 0, 3)
342MAKEROWY(RAW, 0, 1, 2, 3)
343#undef MAKEROWY
344
345// JPeg uses a variation on BT.601-1 full range
346// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
347// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
348// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
349// BT.601 Mpeg range uses:
350// b 0.1016 * 255 = 25.908 = 25
351// g 0.5078 * 255 = 129.489 = 129
352// r 0.2578 * 255 = 65.739 = 66
353// JPeg 8 bit Y (not used):
354// b 0.11400 * 256 = 29.184 = 29
355// g 0.58700 * 256 = 150.272 = 150
356// r 0.29900 * 256 = 76.544 = 77
357// JPeg 7 bit Y:
358// b 0.11400 * 128 = 14.592 = 15
359// g 0.58700 * 128 = 75.136 = 75
360// r 0.29900 * 128 = 38.272 = 38
361// JPeg 8 bit U:
362// b  0.50000 * 255 = 127.5 = 127
363// g -0.33126 * 255 = -84.4713 = -84
364// r -0.16874 * 255 = -43.0287 = -43
365// JPeg 8 bit V:
366// b -0.08131 * 255 = -20.73405 = -20
367// g -0.41869 * 255 = -106.76595 = -107
368// r  0.50000 * 255 = 127.5 = 127
369
370static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
371  return (38 * r + 75 * g +  15 * b + 64) >> 7;
372}
373
374static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
375  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
376}
377static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
378  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
379}
380
381#define AVGB(a, b) (((a) + (b) + 1) >> 1)
382
383#define MAKEROWYJ(NAME, R, G, B, BPP) \
384void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
385  int x;                                                                       \
386  for (x = 0; x < width; ++x) {                                                \
387    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
388    src_argb0 += BPP;                                                          \
389    dst_y += 1;                                                                \
390  }                                                                            \
391}                                                                              \
392void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
393                        uint8* dst_u, uint8* dst_v, int width) {               \
394  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
395  int x;                                                                       \
396  for (x = 0; x < width - 1; x += 2) {                                         \
397    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
398                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
399    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
400                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
401    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
402                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
403    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
404    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
405    src_rgb0 += BPP * 2;                                                       \
406    src_rgb1 += BPP * 2;                                                       \
407    dst_u += 1;                                                                \
408    dst_v += 1;                                                                \
409  }                                                                            \
410  if (width & 1) {                                                             \
411    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
412    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
413    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
414    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
415    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
416  }                                                                            \
417}
418
419MAKEROWYJ(ARGB, 2, 1, 0, 4)
420#undef MAKEROWYJ
421
422void ARGBToUVJ422Row_C(const uint8* src_argb,
423                       uint8* dst_u, uint8* dst_v, int width) {
424  int x;
425  for (x = 0; x < width - 1; x += 2) {
426    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
427    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
428    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
429    dst_u[0] = RGBToUJ(ar, ag, ab);
430    dst_v[0] = RGBToVJ(ar, ag, ab);
431    src_argb += 8;
432    dst_u += 1;
433    dst_v += 1;
434  }
435  if (width & 1) {
436    uint8 ab = src_argb[0];
437    uint8 ag = src_argb[1];
438    uint8 ar = src_argb[2];
439    dst_u[0] = RGBToUJ(ar, ag, ab);
440    dst_v[0] = RGBToVJ(ar, ag, ab);
441  }
442}
443
444void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
445  int x;
446  for (x = 0; x < width; ++x) {
447    uint8 b = src_rgb565[0] & 0x1f;
448    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
449    uint8 r = src_rgb565[1] >> 3;
450    b = (b << 3) | (b >> 2);
451    g = (g << 2) | (g >> 4);
452    r = (r << 3) | (r >> 2);
453    dst_y[0] = RGBToY(r, g, b);
454    src_rgb565 += 2;
455    dst_y += 1;
456  }
457}
458
459void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
460  int x;
461  for (x = 0; x < width; ++x) {
462    uint8 b = src_argb1555[0] & 0x1f;
463    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
464    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
465    b = (b << 3) | (b >> 2);
466    g = (g << 3) | (g >> 2);
467    r = (r << 3) | (r >> 2);
468    dst_y[0] = RGBToY(r, g, b);
469    src_argb1555 += 2;
470    dst_y += 1;
471  }
472}
473
474void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
475  int x;
476  for (x = 0; x < width; ++x) {
477    uint8 b = src_argb4444[0] & 0x0f;
478    uint8 g = src_argb4444[0] >> 4;
479    uint8 r = src_argb4444[1] & 0x0f;
480    b = (b << 4) | b;
481    g = (g << 4) | g;
482    r = (r << 4) | r;
483    dst_y[0] = RGBToY(r, g, b);
484    src_argb4444 += 2;
485    dst_y += 1;
486  }
487}
488
489void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
490                     uint8* dst_u, uint8* dst_v, int width) {
491  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
492  int x;
493  for (x = 0; x < width - 1; x += 2) {
494    uint8 b0 = src_rgb565[0] & 0x1f;
495    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
496    uint8 r0 = src_rgb565[1] >> 3;
497    uint8 b1 = src_rgb565[2] & 0x1f;
498    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
499    uint8 r1 = src_rgb565[3] >> 3;
500    uint8 b2 = next_rgb565[0] & 0x1f;
501    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
502    uint8 r2 = next_rgb565[1] >> 3;
503    uint8 b3 = next_rgb565[2] & 0x1f;
504    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
505    uint8 r3 = next_rgb565[3] >> 3;
506    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
507    uint8 g = (g0 + g1 + g2 + g3);
508    uint8 r = (r0 + r1 + r2 + r3);
509    b = (b << 1) | (b >> 6);  // 787 -> 888.
510    r = (r << 1) | (r >> 6);
511    dst_u[0] = RGBToU(r, g, b);
512    dst_v[0] = RGBToV(r, g, b);
513    src_rgb565 += 4;
514    next_rgb565 += 4;
515    dst_u += 1;
516    dst_v += 1;
517  }
518  if (width & 1) {
519    uint8 b0 = src_rgb565[0] & 0x1f;
520    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
521    uint8 r0 = src_rgb565[1] >> 3;
522    uint8 b2 = next_rgb565[0] & 0x1f;
523    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
524    uint8 r2 = next_rgb565[1] >> 3;
525    uint8 b = (b0 + b2);  // 565 * 2 = 676.
526    uint8 g = (g0 + g2);
527    uint8 r = (r0 + r2);
528    b = (b << 2) | (b >> 4);  // 676 -> 888
529    g = (g << 1) | (g >> 6);
530    r = (r << 2) | (r >> 4);
531    dst_u[0] = RGBToU(r, g, b);
532    dst_v[0] = RGBToV(r, g, b);
533  }
534}
535
536void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
537                       uint8* dst_u, uint8* dst_v, int width) {
538  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
539  int x;
540  for (x = 0; x < width - 1; x += 2) {
541    uint8 b0 = src_argb1555[0] & 0x1f;
542    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
543    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
544    uint8 b1 = src_argb1555[2] & 0x1f;
545    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
546    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
547    uint8 b2 = next_argb1555[0] & 0x1f;
548    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
549    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
550    uint8 b3 = next_argb1555[2] & 0x1f;
551    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
552    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
553    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
554    uint8 g = (g0 + g1 + g2 + g3);
555    uint8 r = (r0 + r1 + r2 + r3);
556    b = (b << 1) | (b >> 6);  // 777 -> 888.
557    g = (g << 1) | (g >> 6);
558    r = (r << 1) | (r >> 6);
559    dst_u[0] = RGBToU(r, g, b);
560    dst_v[0] = RGBToV(r, g, b);
561    src_argb1555 += 4;
562    next_argb1555 += 4;
563    dst_u += 1;
564    dst_v += 1;
565  }
566  if (width & 1) {
567    uint8 b0 = src_argb1555[0] & 0x1f;
568    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
569    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
570    uint8 b2 = next_argb1555[0] & 0x1f;
571    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
572    uint8 r2 = next_argb1555[1] >> 3;
573    uint8 b = (b0 + b2);  // 555 * 2 = 666.
574    uint8 g = (g0 + g2);
575    uint8 r = (r0 + r2);
576    b = (b << 2) | (b >> 4);  // 666 -> 888.
577    g = (g << 2) | (g >> 4);
578    r = (r << 2) | (r >> 4);
579    dst_u[0] = RGBToU(r, g, b);
580    dst_v[0] = RGBToV(r, g, b);
581  }
582}
583
584void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
585                       uint8* dst_u, uint8* dst_v, int width) {
586  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
587  int x;
588  for (x = 0; x < width - 1; x += 2) {
589    uint8 b0 = src_argb4444[0] & 0x0f;
590    uint8 g0 = src_argb4444[0] >> 4;
591    uint8 r0 = src_argb4444[1] & 0x0f;
592    uint8 b1 = src_argb4444[2] & 0x0f;
593    uint8 g1 = src_argb4444[2] >> 4;
594    uint8 r1 = src_argb4444[3] & 0x0f;
595    uint8 b2 = next_argb4444[0] & 0x0f;
596    uint8 g2 = next_argb4444[0] >> 4;
597    uint8 r2 = next_argb4444[1] & 0x0f;
598    uint8 b3 = next_argb4444[2] & 0x0f;
599    uint8 g3 = next_argb4444[2] >> 4;
600    uint8 r3 = next_argb4444[3] & 0x0f;
601    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
602    uint8 g = (g0 + g1 + g2 + g3);
603    uint8 r = (r0 + r1 + r2 + r3);
604    b = (b << 2) | (b >> 4);  // 666 -> 888.
605    g = (g << 2) | (g >> 4);
606    r = (r << 2) | (r >> 4);
607    dst_u[0] = RGBToU(r, g, b);
608    dst_v[0] = RGBToV(r, g, b);
609    src_argb4444 += 4;
610    next_argb4444 += 4;
611    dst_u += 1;
612    dst_v += 1;
613  }
614  if (width & 1) {
615    uint8 b0 = src_argb4444[0] & 0x0f;
616    uint8 g0 = src_argb4444[0] >> 4;
617    uint8 r0 = src_argb4444[1] & 0x0f;
618    uint8 b2 = next_argb4444[0] & 0x0f;
619    uint8 g2 = next_argb4444[0] >> 4;
620    uint8 r2 = next_argb4444[1] & 0x0f;
621    uint8 b = (b0 + b2);  // 444 * 2 = 555.
622    uint8 g = (g0 + g2);
623    uint8 r = (r0 + r2);
624    b = (b << 3) | (b >> 2);  // 555 -> 888.
625    g = (g << 3) | (g >> 2);
626    r = (r << 3) | (r >> 2);
627    dst_u[0] = RGBToU(r, g, b);
628    dst_v[0] = RGBToV(r, g, b);
629  }
630}
631
632void ARGBToUV444Row_C(const uint8* src_argb,
633                      uint8* dst_u, uint8* dst_v, int width) {
634  int x;
635  for (x = 0; x < width; ++x) {
636    uint8 ab = src_argb[0];
637    uint8 ag = src_argb[1];
638    uint8 ar = src_argb[2];
639    dst_u[0] = RGBToU(ar, ag, ab);
640    dst_v[0] = RGBToV(ar, ag, ab);
641    src_argb += 4;
642    dst_u += 1;
643    dst_v += 1;
644  }
645}
646
647void ARGBToUV422Row_C(const uint8* src_argb,
648                      uint8* dst_u, uint8* dst_v, int width) {
649  int x;
650  for (x = 0; x < width - 1; x += 2) {
651    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
652    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
653    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
654    dst_u[0] = RGBToU(ar, ag, ab);
655    dst_v[0] = RGBToV(ar, ag, ab);
656    src_argb += 8;
657    dst_u += 1;
658    dst_v += 1;
659  }
660  if (width & 1) {
661    uint8 ab = src_argb[0];
662    uint8 ag = src_argb[1];
663    uint8 ar = src_argb[2];
664    dst_u[0] = RGBToU(ar, ag, ab);
665    dst_v[0] = RGBToV(ar, ag, ab);
666  }
667}
668
669void ARGBToUV411Row_C(const uint8* src_argb,
670                      uint8* dst_u, uint8* dst_v, int width) {
671  int x;
672  for (x = 0; x < width - 3; x += 4) {
673    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
674    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
675    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
676    dst_u[0] = RGBToU(ar, ag, ab);
677    dst_v[0] = RGBToV(ar, ag, ab);
678    src_argb += 16;
679    dst_u += 1;
680    dst_v += 1;
681  }
682  if ((width & 3) == 3) {
683    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
684    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
685    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
686    dst_u[0] = RGBToU(ar, ag, ab);
687    dst_v[0] = RGBToV(ar, ag, ab);
688  } else if ((width & 3) == 2) {
689    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
690    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
691    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
692    dst_u[0] = RGBToU(ar, ag, ab);
693    dst_v[0] = RGBToV(ar, ag, ab);
694  } else if ((width & 3) == 1) {
695    uint8 ab = src_argb[0];
696    uint8 ag = src_argb[1];
697    uint8 ar = src_argb[2];
698    dst_u[0] = RGBToU(ar, ag, ab);
699    dst_v[0] = RGBToV(ar, ag, ab);
700  }
701}
702
703void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
704  int x;
705  for (x = 0; x < width; ++x) {
706    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
707    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
708    dst_argb[3] = src_argb[3];
709    dst_argb += 4;
710    src_argb += 4;
711  }
712}
713
714// Convert a row of image to Sepia tone.
715void ARGBSepiaRow_C(uint8* dst_argb, int width) {
716  int x;
717  for (x = 0; x < width; ++x) {
718    int b = dst_argb[0];
719    int g = dst_argb[1];
720    int r = dst_argb[2];
721    int sb = (b * 17 + g * 68 + r * 35) >> 7;
722    int sg = (b * 22 + g * 88 + r * 45) >> 7;
723    int sr = (b * 24 + g * 98 + r * 50) >> 7;
724    // b does not over flow. a is preserved from original.
725    dst_argb[0] = sb;
726    dst_argb[1] = clamp255(sg);
727    dst_argb[2] = clamp255(sr);
728    dst_argb += 4;
729  }
730}
731
732// Apply color matrix to a row of image. Matrix is signed.
733// TODO(fbarchard): Consider adding rounding (+32).
734void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
735                          const int8* matrix_argb, int width) {
736  int x;
737  for (x = 0; x < width; ++x) {
738    int b = src_argb[0];
739    int g = src_argb[1];
740    int r = src_argb[2];
741    int a = src_argb[3];
742    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
743              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
744    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
745              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
746    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
747              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
748    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
749              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
750    dst_argb[0] = Clamp(sb);
751    dst_argb[1] = Clamp(sg);
752    dst_argb[2] = Clamp(sr);
753    dst_argb[3] = Clamp(sa);
754    src_argb += 4;
755    dst_argb += 4;
756  }
757}
758
759// Apply color table to a row of image.
760void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
761  int x;
762  for (x = 0; x < width; ++x) {
763    int b = dst_argb[0];
764    int g = dst_argb[1];
765    int r = dst_argb[2];
766    int a = dst_argb[3];
767    dst_argb[0] = table_argb[b * 4 + 0];
768    dst_argb[1] = table_argb[g * 4 + 1];
769    dst_argb[2] = table_argb[r * 4 + 2];
770    dst_argb[3] = table_argb[a * 4 + 3];
771    dst_argb += 4;
772  }
773}
774
775// Apply color table to a row of image.
776void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
777  int x;
778  for (x = 0; x < width; ++x) {
779    int b = dst_argb[0];
780    int g = dst_argb[1];
781    int r = dst_argb[2];
782    dst_argb[0] = table_argb[b * 4 + 0];
783    dst_argb[1] = table_argb[g * 4 + 1];
784    dst_argb[2] = table_argb[r * 4 + 2];
785    dst_argb += 4;
786  }
787}
788
789void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
790                       int interval_offset, int width) {
791  int x;
792  for (x = 0; x < width; ++x) {
793    int b = dst_argb[0];
794    int g = dst_argb[1];
795    int r = dst_argb[2];
796    dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
797    dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
798    dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
799    dst_argb += 4;
800  }
801}
802
803#define REPEAT8(v) (v) | ((v) << 8)
804#define SHADE(f, v) v * f >> 24
805
806void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
807                    uint32 value) {
808  const uint32 b_scale = REPEAT8(value & 0xff);
809  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
810  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
811  const uint32 a_scale = REPEAT8(value >> 24);
812
813  int i;
814  for (i = 0; i < width; ++i) {
815    const uint32 b = REPEAT8(src_argb[0]);
816    const uint32 g = REPEAT8(src_argb[1]);
817    const uint32 r = REPEAT8(src_argb[2]);
818    const uint32 a = REPEAT8(src_argb[3]);
819    dst_argb[0] = SHADE(b, b_scale);
820    dst_argb[1] = SHADE(g, g_scale);
821    dst_argb[2] = SHADE(r, r_scale);
822    dst_argb[3] = SHADE(a, a_scale);
823    src_argb += 4;
824    dst_argb += 4;
825  }
826}
827#undef REPEAT8
828#undef SHADE
829
830#define REPEAT8(v) (v) | ((v) << 8)
831#define SHADE(f, v) v * f >> 16
832
833void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
834                       uint8* dst_argb, int width) {
835  int i;
836  for (i = 0; i < width; ++i) {
837    const uint32 b = REPEAT8(src_argb0[0]);
838    const uint32 g = REPEAT8(src_argb0[1]);
839    const uint32 r = REPEAT8(src_argb0[2]);
840    const uint32 a = REPEAT8(src_argb0[3]);
841    const uint32 b_scale = src_argb1[0];
842    const uint32 g_scale = src_argb1[1];
843    const uint32 r_scale = src_argb1[2];
844    const uint32 a_scale = src_argb1[3];
845    dst_argb[0] = SHADE(b, b_scale);
846    dst_argb[1] = SHADE(g, g_scale);
847    dst_argb[2] = SHADE(r, r_scale);
848    dst_argb[3] = SHADE(a, a_scale);
849    src_argb0 += 4;
850    src_argb1 += 4;
851    dst_argb += 4;
852  }
853}
854#undef REPEAT8
855#undef SHADE
856
857#define SHADE(f, v) clamp255(v + f)
858
859void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
860                  uint8* dst_argb, int width) {
861  int i;
862  for (i = 0; i < width; ++i) {
863    const int b = src_argb0[0];
864    const int g = src_argb0[1];
865    const int r = src_argb0[2];
866    const int a = src_argb0[3];
867    const int b_add = src_argb1[0];
868    const int g_add = src_argb1[1];
869    const int r_add = src_argb1[2];
870    const int a_add = src_argb1[3];
871    dst_argb[0] = SHADE(b, b_add);
872    dst_argb[1] = SHADE(g, g_add);
873    dst_argb[2] = SHADE(r, r_add);
874    dst_argb[3] = SHADE(a, a_add);
875    src_argb0 += 4;
876    src_argb1 += 4;
877    dst_argb += 4;
878  }
879}
880#undef SHADE
881
882#define SHADE(f, v) clamp0(f - v)
883
884void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
885                       uint8* dst_argb, int width) {
886  int i;
887  for (i = 0; i < width; ++i) {
888    const int b = src_argb0[0];
889    const int g = src_argb0[1];
890    const int r = src_argb0[2];
891    const int a = src_argb0[3];
892    const int b_sub = src_argb1[0];
893    const int g_sub = src_argb1[1];
894    const int r_sub = src_argb1[2];
895    const int a_sub = src_argb1[3];
896    dst_argb[0] = SHADE(b, b_sub);
897    dst_argb[1] = SHADE(g, g_sub);
898    dst_argb[2] = SHADE(r, r_sub);
899    dst_argb[3] = SHADE(a, a_sub);
900    src_argb0 += 4;
901    src_argb1 += 4;
902    dst_argb += 4;
903  }
904}
905#undef SHADE
906
907// Sobel functions which mimics SSSE3.
908void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
909                 uint8* dst_sobelx, int width) {
910  int i;
911  for (i = 0; i < width; ++i) {
912    int a = src_y0[i];
913    int b = src_y1[i];
914    int c = src_y2[i];
915    int a_sub = src_y0[i + 2];
916    int b_sub = src_y1[i + 2];
917    int c_sub = src_y2[i + 2];
918    int a_diff = a - a_sub;
919    int b_diff = b - b_sub;
920    int c_diff = c - c_sub;
921    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
922    dst_sobelx[i] = (uint8)(clamp255(sobel));
923  }
924}
925
926void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
927                 uint8* dst_sobely, int width) {
928  int i;
929  for (i = 0; i < width; ++i) {
930    int a = src_y0[i + 0];
931    int b = src_y0[i + 1];
932    int c = src_y0[i + 2];
933    int a_sub = src_y1[i + 0];
934    int b_sub = src_y1[i + 1];
935    int c_sub = src_y1[i + 2];
936    int a_diff = a - a_sub;
937    int b_diff = b - b_sub;
938    int c_diff = c - c_sub;
939    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
940    dst_sobely[i] = (uint8)(clamp255(sobel));
941  }
942}
943
944void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
945                uint8* dst_argb, int width) {
946  int i;
947  for (i = 0; i < width; ++i) {
948    int r = src_sobelx[i];
949    int b = src_sobely[i];
950    int s = clamp255(r + b);
951    dst_argb[0] = (uint8)(s);
952    dst_argb[1] = (uint8)(s);
953    dst_argb[2] = (uint8)(s);
954    dst_argb[3] = (uint8)(255u);
955    dst_argb += 4;
956  }
957}
958
959void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
960                       uint8* dst_y, int width) {
961  int i;
962  for (i = 0; i < width; ++i) {
963    int r = src_sobelx[i];
964    int b = src_sobely[i];
965    int s = clamp255(r + b);
966    dst_y[i] = (uint8)(s);
967  }
968}
969
970void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
971                  uint8* dst_argb, int width) {
972  int i;
973  for (i = 0; i < width; ++i) {
974    int r = src_sobelx[i];
975    int b = src_sobely[i];
976    int g = clamp255(r + b);
977    dst_argb[0] = (uint8)(b);
978    dst_argb[1] = (uint8)(g);
979    dst_argb[2] = (uint8)(r);
980    dst_argb[3] = (uint8)(255u);
981    dst_argb += 4;
982  }
983}
984
985void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
986  // Copy a Y to RGB.
987  int x;
988  for (x = 0; x < width; ++x) {
989    uint8 y = src_y[0];
990    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
991    dst_argb[3] = 255u;
992    dst_argb += 4;
993    ++src_y;
994  }
995}
996
997// BT.601 YUV to RGB reference
998//  R = (Y - 16) * 1.164              - V * -1.596
999//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
1000//  B = (Y - 16) * 1.164 - U * -2.018
1001
1002// Y contribution to R,G,B.  Scale and bias.
1003// TODO(fbarchard): Consider moving constants into a common header.
1004#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
1005#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
1006
1007// U and V contributions to R,G,B.
1008#define UB -128 /* max(-128, round(-2.018 * 64)) */
1009#define UG 25 /* round(0.391 * 64) */
1010#define VG 52 /* round(0.813 * 64) */
1011#define VR -102 /* round(-1.596 * 64) */
1012
1013// Bias values to subtract 16 from Y and 128 from U and V.
1014#define BB (UB * 128 + YGB)
1015#define BG (UG * 128 + VG * 128 + YGB)
1016#define BR (VR * 128 + YGB)
1017
1018// C reference code that mimics the YUV assembly.
1019static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
1020                              uint8* b, uint8* g, uint8* r) {
1021  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
1022  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
1023  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
1024  *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
1025}
1026
1027// C reference code that mimics the YUV assembly.
1028static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
1029  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
1030  *b = Clamp((int32)(y1 + YGB) >> 6);
1031  *g = Clamp((int32)(y1 + YGB) >> 6);
1032  *r = Clamp((int32)(y1 + YGB) >> 6);
1033}
1034
1035#undef YG
1036#undef YGB
1037#undef UB
1038#undef UG
1039#undef VG
1040#undef VR
1041#undef BB
1042#undef BG
1043#undef BR
1044
1045// JPEG YUV to RGB reference
1046// *  R = Y                - V * -1.40200
1047// *  G = Y - U *  0.34414 - V *  0.71414
1048// *  B = Y - U * -1.77200
1049
1050// Y contribution to R,G,B.  Scale and bias.
1051// TODO(fbarchard): Consider moving constants into a common header.
1052#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
1053#define YGBJ 32  /* 64 / 2 */
1054
1055// U and V contributions to R,G,B.
1056#define UBJ -113 /* round(-1.77200 * 64) */
1057#define UGJ 22 /* round(0.34414 * 64) */
1058#define VGJ 46 /* round(0.71414  * 64) */
1059#define VRJ -90 /* round(-1.40200 * 64) */
1060
// Bias values to round, and subtract 128 from U and V.
1062#define BBJ (UBJ * 128 + YGBJ)
1063#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
1064#define BRJ (VRJ * 128 + YGBJ)
1065
1066// C reference code that mimics the YUV assembly.
1067static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
1068                               uint8* b, uint8* g, uint8* r) {
1069  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
1070  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
1071  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
1072  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
1073}
1074
1075#undef YGJ
1076#undef YGBJ
1077#undef UBJ
1078#undef UGJ
1079#undef VGJ
1080#undef VRJ
1081#undef BBJ
1082#undef BGJ
1083#undef BRJ
1084
1085#if !defined(LIBYUV_DISABLE_NEON) && \
1086    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
1087// C mimic assembly.
1088// TODO(fbarchard): Remove subsampling from Neon.
1089void I444ToARGBRow_C(const uint8* src_y,
1090                     const uint8* src_u,
1091                     const uint8* src_v,
1092                     uint8* rgb_buf,
1093                     int width) {
1094  int x;
1095  for (x = 0; x < width - 1; x += 2) {
1096    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
1097    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
1098    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1099    rgb_buf[3] = 255;
1100    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1101    rgb_buf[7] = 255;
1102    src_y += 2;
1103    src_u += 2;
1104    src_v += 2;
1105    rgb_buf += 8;  // Advance 2 pixels.
1106  }
1107  if (width & 1) {
1108    YuvPixel(src_y[0], src_u[0], src_v[0],
1109             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1110  }
1111}
1112#else
1113void I444ToARGBRow_C(const uint8* src_y,
1114                     const uint8* src_u,
1115                     const uint8* src_v,
1116                     uint8* rgb_buf,
1117                     int width) {
1118  int x;
1119  for (x = 0; x < width; ++x) {
1120    YuvPixel(src_y[0], src_u[0], src_v[0],
1121             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1122    rgb_buf[3] = 255;
1123    src_y += 1;
1124    src_u += 1;
1125    src_v += 1;
1126    rgb_buf += 4;  // Advance 1 pixel.
1127  }
1128}
1129#endif
1130
1131// Also used for 420
1132void I422ToARGBRow_C(const uint8* src_y,
1133                     const uint8* src_u,
1134                     const uint8* src_v,
1135                     uint8* rgb_buf,
1136                     int width) {
1137  int x;
1138  for (x = 0; x < width - 1; x += 2) {
1139    YuvPixel(src_y[0], src_u[0], src_v[0],
1140             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1141    rgb_buf[3] = 255;
1142    YuvPixel(src_y[1], src_u[0], src_v[0],
1143             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1144    rgb_buf[7] = 255;
1145    src_y += 2;
1146    src_u += 1;
1147    src_v += 1;
1148    rgb_buf += 8;  // Advance 2 pixels.
1149  }
1150  if (width & 1) {
1151    YuvPixel(src_y[0], src_u[0], src_v[0],
1152             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1153    rgb_buf[3] = 255;
1154  }
1155}
1156
1157void J422ToARGBRow_C(const uint8* src_y,
1158                     const uint8* src_u,
1159                     const uint8* src_v,
1160                     uint8* rgb_buf,
1161                     int width) {
1162  int x;
1163  for (x = 0; x < width - 1; x += 2) {
1164    YuvJPixel(src_y[0], src_u[0], src_v[0],
1165              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1166    rgb_buf[3] = 255;
1167    YuvJPixel(src_y[1], src_u[0], src_v[0],
1168              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1169    rgb_buf[7] = 255;
1170    src_y += 2;
1171    src_u += 1;
1172    src_v += 1;
1173    rgb_buf += 8;  // Advance 2 pixels.
1174  }
1175  if (width & 1) {
1176    YuvJPixel(src_y[0], src_u[0], src_v[0],
1177              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1178    rgb_buf[3] = 255;
1179  }
1180}
1181
1182void I422ToRGB24Row_C(const uint8* src_y,
1183                      const uint8* src_u,
1184                      const uint8* src_v,
1185                      uint8* rgb_buf,
1186                      int width) {
1187  int x;
1188  for (x = 0; x < width - 1; x += 2) {
1189    YuvPixel(src_y[0], src_u[0], src_v[0],
1190             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1191    YuvPixel(src_y[1], src_u[0], src_v[0],
1192             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
1193    src_y += 2;
1194    src_u += 1;
1195    src_v += 1;
1196    rgb_buf += 6;  // Advance 2 pixels.
1197  }
1198  if (width & 1) {
1199    YuvPixel(src_y[0], src_u[0], src_v[0],
1200             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1201  }
1202}
1203
1204void I422ToRAWRow_C(const uint8* src_y,
1205                    const uint8* src_u,
1206                    const uint8* src_v,
1207                    uint8* rgb_buf,
1208                    int width) {
1209  int x;
1210  for (x = 0; x < width - 1; x += 2) {
1211    YuvPixel(src_y[0], src_u[0], src_v[0],
1212             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1213    YuvPixel(src_y[1], src_u[0], src_v[0],
1214             rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
1215    src_y += 2;
1216    src_u += 1;
1217    src_v += 1;
1218    rgb_buf += 6;  // Advance 2 pixels.
1219  }
1220  if (width & 1) {
1221    YuvPixel(src_y[0], src_u[0], src_v[0],
1222             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1223  }
1224}
1225
1226void I422ToARGB4444Row_C(const uint8* src_y,
1227                         const uint8* src_u,
1228                         const uint8* src_v,
1229                         uint8* dst_argb4444,
1230                         int width) {
1231  uint8 b0;
1232  uint8 g0;
1233  uint8 r0;
1234  uint8 b1;
1235  uint8 g1;
1236  uint8 r1;
1237  int x;
1238  for (x = 0; x < width - 1; x += 2) {
1239    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1240    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1241    b0 = b0 >> 4;
1242    g0 = g0 >> 4;
1243    r0 = r0 >> 4;
1244    b1 = b1 >> 4;
1245    g1 = g1 >> 4;
1246    r1 = r1 >> 4;
1247    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1248        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
1249    src_y += 2;
1250    src_u += 1;
1251    src_v += 1;
1252    dst_argb4444 += 4;  // Advance 2 pixels.
1253  }
1254  if (width & 1) {
1255    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1256    b0 = b0 >> 4;
1257    g0 = g0 >> 4;
1258    r0 = r0 >> 4;
1259    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1260        0xf000;
1261  }
1262}
1263
1264void I422ToARGB1555Row_C(const uint8* src_y,
1265                         const uint8* src_u,
1266                         const uint8* src_v,
1267                         uint8* dst_argb1555,
1268                         int width) {
1269  uint8 b0;
1270  uint8 g0;
1271  uint8 r0;
1272  uint8 b1;
1273  uint8 g1;
1274  uint8 r1;
1275  int x;
1276  for (x = 0; x < width - 1; x += 2) {
1277    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1278    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1279    b0 = b0 >> 3;
1280    g0 = g0 >> 3;
1281    r0 = r0 >> 3;
1282    b1 = b1 >> 3;
1283    g1 = g1 >> 3;
1284    r1 = r1 >> 3;
1285    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1286        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
1287    src_y += 2;
1288    src_u += 1;
1289    src_v += 1;
1290    dst_argb1555 += 4;  // Advance 2 pixels.
1291  }
1292  if (width & 1) {
1293    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1294    b0 = b0 >> 3;
1295    g0 = g0 >> 3;
1296    r0 = r0 >> 3;
1297    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1298        0x8000;
1299  }
1300}
1301
1302void I422ToRGB565Row_C(const uint8* src_y,
1303                       const uint8* src_u,
1304                       const uint8* src_v,
1305                       uint8* dst_rgb565,
1306                       int width) {
1307  uint8 b0;
1308  uint8 g0;
1309  uint8 r0;
1310  uint8 b1;
1311  uint8 g1;
1312  uint8 r1;
1313  int x;
1314  for (x = 0; x < width - 1; x += 2) {
1315    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1316    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1317    b0 = b0 >> 3;
1318    g0 = g0 >> 2;
1319    r0 = r0 >> 3;
1320    b1 = b1 >> 3;
1321    g1 = g1 >> 2;
1322    r1 = r1 >> 3;
1323    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1324        (b1 << 16) | (g1 << 21) | (r1 << 27);
1325    src_y += 2;
1326    src_u += 1;
1327    src_v += 1;
1328    dst_rgb565 += 4;  // Advance 2 pixels.
1329  }
1330  if (width & 1) {
1331    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1332    b0 = b0 >> 3;
1333    g0 = g0 >> 2;
1334    r0 = r0 >> 3;
1335    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1336  }
1337}
1338
1339void I411ToARGBRow_C(const uint8* src_y,
1340                     const uint8* src_u,
1341                     const uint8* src_v,
1342                     uint8* rgb_buf,
1343                     int width) {
1344  int x;
1345  for (x = 0; x < width - 3; x += 4) {
1346    YuvPixel(src_y[0], src_u[0], src_v[0],
1347             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1348    rgb_buf[3] = 255;
1349    YuvPixel(src_y[1], src_u[0], src_v[0],
1350             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1351    rgb_buf[7] = 255;
1352    YuvPixel(src_y[2], src_u[0], src_v[0],
1353             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
1354    rgb_buf[11] = 255;
1355    YuvPixel(src_y[3], src_u[0], src_v[0],
1356             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
1357    rgb_buf[15] = 255;
1358    src_y += 4;
1359    src_u += 1;
1360    src_v += 1;
1361    rgb_buf += 16;  // Advance 4 pixels.
1362  }
1363  if (width & 2) {
1364    YuvPixel(src_y[0], src_u[0], src_v[0],
1365             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1366    rgb_buf[3] = 255;
1367    YuvPixel(src_y[1], src_u[0], src_v[0],
1368             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1369    rgb_buf[7] = 255;
1370    src_y += 2;
1371    rgb_buf += 8;  // Advance 2 pixels.
1372  }
1373  if (width & 1) {
1374    YuvPixel(src_y[0], src_u[0], src_v[0],
1375             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1376    rgb_buf[3] = 255;
1377  }
1378}
1379
1380void NV12ToARGBRow_C(const uint8* src_y,
1381                     const uint8* src_uv,
1382                     uint8* rgb_buf,
1383                     int width) {
1384  int x;
1385  for (x = 0; x < width - 1; x += 2) {
1386    YuvPixel(src_y[0], src_uv[0], src_uv[1],
1387             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1388    rgb_buf[3] = 255;
1389    YuvPixel(src_y[1], src_uv[0], src_uv[1],
1390             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1391    rgb_buf[7] = 255;
1392    src_y += 2;
1393    src_uv += 2;
1394    rgb_buf += 8;  // Advance 2 pixels.
1395  }
1396  if (width & 1) {
1397    YuvPixel(src_y[0], src_uv[0], src_uv[1],
1398             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1399    rgb_buf[3] = 255;
1400  }
1401}
1402
1403void NV21ToARGBRow_C(const uint8* src_y,
1404                     const uint8* src_vu,
1405                     uint8* rgb_buf,
1406                     int width) {
1407  int x;
1408  for (x = 0; x < width - 1; x += 2) {
1409    YuvPixel(src_y[0], src_vu[1], src_vu[0],
1410             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1411    rgb_buf[3] = 255;
1412
1413    YuvPixel(src_y[1], src_vu[1], src_vu[0],
1414             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1415    rgb_buf[7] = 255;
1416
1417    src_y += 2;
1418    src_vu += 2;
1419    rgb_buf += 8;  // Advance 2 pixels.
1420  }
1421  if (width & 1) {
1422    YuvPixel(src_y[0], src_vu[1], src_vu[0],
1423             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1424    rgb_buf[3] = 255;
1425  }
1426}
1427
1428void NV12ToRGB565Row_C(const uint8* src_y,
1429                       const uint8* src_uv,
1430                       uint8* dst_rgb565,
1431                       int width) {
1432  uint8 b0;
1433  uint8 g0;
1434  uint8 r0;
1435  uint8 b1;
1436  uint8 g1;
1437  uint8 r1;
1438  int x;
1439  for (x = 0; x < width - 1; x += 2) {
1440    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
1441    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
1442    b0 = b0 >> 3;
1443    g0 = g0 >> 2;
1444    r0 = r0 >> 3;
1445    b1 = b1 >> 3;
1446    g1 = g1 >> 2;
1447    r1 = r1 >> 3;
1448    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1449        (b1 << 16) | (g1 << 21) | (r1 << 27);
1450    src_y += 2;
1451    src_uv += 2;
1452    dst_rgb565 += 4;  // Advance 2 pixels.
1453  }
1454  if (width & 1) {
1455    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
1456    b0 = b0 >> 3;
1457    g0 = g0 >> 2;
1458    r0 = r0 >> 3;
1459    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1460  }
1461}
1462
1463void NV21ToRGB565Row_C(const uint8* src_y,
1464                       const uint8* vsrc_u,
1465                       uint8* dst_rgb565,
1466                       int width) {
1467  uint8 b0;
1468  uint8 g0;
1469  uint8 r0;
1470  uint8 b1;
1471  uint8 g1;
1472  uint8 r1;
1473  int x;
1474  for (x = 0; x < width - 1; x += 2) {
1475    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1476    YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
1477    b0 = b0 >> 3;
1478    g0 = g0 >> 2;
1479    r0 = r0 >> 3;
1480    b1 = b1 >> 3;
1481    g1 = g1 >> 2;
1482    r1 = r1 >> 3;
1483    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1484        (b1 << 16) | (g1 << 21) | (r1 << 27);
1485    src_y += 2;
1486    vsrc_u += 2;
1487    dst_rgb565 += 4;  // Advance 2 pixels.
1488  }
1489  if (width & 1) {
1490    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1491    b0 = b0 >> 3;
1492    g0 = g0 >> 2;
1493    r0 = r0 >> 3;
1494    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1495  }
1496}
1497
1498void YUY2ToARGBRow_C(const uint8* src_yuy2,
1499                     uint8* rgb_buf,
1500                     int width) {
1501  int x;
1502  for (x = 0; x < width - 1; x += 2) {
1503    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1504             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1505    rgb_buf[3] = 255;
1506    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
1507             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1508    rgb_buf[7] = 255;
1509    src_yuy2 += 4;
1510    rgb_buf += 8;  // Advance 2 pixels.
1511  }
1512  if (width & 1) {
1513    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1514             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1515    rgb_buf[3] = 255;
1516  }
1517}
1518
1519void UYVYToARGBRow_C(const uint8* src_uyvy,
1520                     uint8* rgb_buf,
1521                     int width) {
1522  int x;
1523  for (x = 0; x < width - 1; x += 2) {
1524    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1525             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1526    rgb_buf[3] = 255;
1527    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
1528             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1529    rgb_buf[7] = 255;
1530    src_uyvy += 4;
1531    rgb_buf += 8;  // Advance 2 pixels.
1532  }
1533  if (width & 1) {
1534    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1535             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1536    rgb_buf[3] = 255;
1537  }
1538}
1539
1540void I422ToBGRARow_C(const uint8* src_y,
1541                     const uint8* src_u,
1542                     const uint8* src_v,
1543                     uint8* rgb_buf,
1544                     int width) {
1545  int x;
1546  for (x = 0; x < width - 1; x += 2) {
1547    YuvPixel(src_y[0], src_u[0], src_v[0],
1548             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1549    rgb_buf[0] = 255;
1550    YuvPixel(src_y[1], src_u[0], src_v[0],
1551             rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
1552    rgb_buf[4] = 255;
1553    src_y += 2;
1554    src_u += 1;
1555    src_v += 1;
1556    rgb_buf += 8;  // Advance 2 pixels.
1557  }
1558  if (width & 1) {
1559    YuvPixel(src_y[0], src_u[0], src_v[0],
1560             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1561    rgb_buf[0] = 255;
1562  }
1563}
1564
1565void I422ToABGRRow_C(const uint8* src_y,
1566                     const uint8* src_u,
1567                     const uint8* src_v,
1568                     uint8* rgb_buf,
1569                     int width) {
1570  int x;
1571  for (x = 0; x < width - 1; x += 2) {
1572    YuvPixel(src_y[0], src_u[0], src_v[0],
1573             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1574    rgb_buf[3] = 255;
1575    YuvPixel(src_y[1], src_u[0], src_v[0],
1576             rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
1577    rgb_buf[7] = 255;
1578    src_y += 2;
1579    src_u += 1;
1580    src_v += 1;
1581    rgb_buf += 8;  // Advance 2 pixels.
1582  }
1583  if (width & 1) {
1584    YuvPixel(src_y[0], src_u[0], src_v[0],
1585             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1586    rgb_buf[3] = 255;
1587  }
1588}
1589
1590void I422ToRGBARow_C(const uint8* src_y,
1591                     const uint8* src_u,
1592                     const uint8* src_v,
1593                     uint8* rgb_buf,
1594                     int width) {
1595  int x;
1596  for (x = 0; x < width - 1; x += 2) {
1597    YuvPixel(src_y[0], src_u[0], src_v[0],
1598             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1599    rgb_buf[0] = 255;
1600    YuvPixel(src_y[1], src_u[0], src_v[0],
1601             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
1602    rgb_buf[4] = 255;
1603    src_y += 2;
1604    src_u += 1;
1605    src_v += 1;
1606    rgb_buf += 8;  // Advance 2 pixels.
1607  }
1608  if (width & 1) {
1609    YuvPixel(src_y[0], src_u[0], src_v[0],
1610             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1611    rgb_buf[0] = 255;
1612  }
1613}
1614
1615void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
1616  int x;
1617  for (x = 0; x < width - 1; x += 2) {
1618    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1619    rgb_buf[3] = 255;
1620    YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1621    rgb_buf[7] = 255;
1622    src_y += 2;
1623    rgb_buf += 8;  // Advance 2 pixels.
1624  }
1625  if (width & 1) {
1626    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1627    rgb_buf[3] = 255;
1628  }
1629}
1630
1631void MirrorRow_C(const uint8* src, uint8* dst, int width) {
1632  int x;
1633  src += width - 1;
1634  for (x = 0; x < width - 1; x += 2) {
1635    dst[x] = src[0];
1636    dst[x + 1] = src[-1];
1637    src -= 2;
1638  }
1639  if (width & 1) {
1640    dst[width - 1] = src[0];
1641  }
1642}
1643
1644void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1645  int x;
1646  src_uv += (width - 1) << 1;
1647  for (x = 0; x < width - 1; x += 2) {
1648    dst_u[x] = src_uv[0];
1649    dst_u[x + 1] = src_uv[-2];
1650    dst_v[x] = src_uv[1];
1651    dst_v[x + 1] = src_uv[-2 + 1];
1652    src_uv -= 4;
1653  }
1654  if (width & 1) {
1655    dst_u[width - 1] = src_uv[0];
1656    dst_v[width - 1] = src_uv[1];
1657  }
1658}
1659
1660void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
1661  int x;
1662  const uint32* src32 = (const uint32*)(src);
1663  uint32* dst32 = (uint32*)(dst);
1664  src32 += width - 1;
1665  for (x = 0; x < width - 1; x += 2) {
1666    dst32[x] = src32[0];
1667    dst32[x + 1] = src32[-1];
1668    src32 -= 2;
1669  }
1670  if (width & 1) {
1671    dst32[width - 1] = src32[0];
1672  }
1673}
1674
1675void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1676  int x;
1677  for (x = 0; x < width - 1; x += 2) {
1678    dst_u[x] = src_uv[0];
1679    dst_u[x + 1] = src_uv[2];
1680    dst_v[x] = src_uv[1];
1681    dst_v[x + 1] = src_uv[3];
1682    src_uv += 4;
1683  }
1684  if (width & 1) {
1685    dst_u[width - 1] = src_uv[0];
1686    dst_v[width - 1] = src_uv[1];
1687  }
1688}
1689
1690void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
1691                  int width) {
1692  int x;
1693  for (x = 0; x < width - 1; x += 2) {
1694    dst_uv[0] = src_u[x];
1695    dst_uv[1] = src_v[x];
1696    dst_uv[2] = src_u[x + 1];
1697    dst_uv[3] = src_v[x + 1];
1698    dst_uv += 4;
1699  }
1700  if (width & 1) {
1701    dst_uv[0] = src_u[width - 1];
1702    dst_uv[1] = src_v[width - 1];
1703  }
1704}
1705
1706void CopyRow_C(const uint8* src, uint8* dst, int count) {
1707  memcpy(dst, src, count);
1708}
1709
1710void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
1711  memcpy(dst, src, count * 2);
1712}
1713
1714void SetRow_C(uint8* dst, uint8 v8, int width) {
1715  memset(dst, v8, width);
1716}
1717
1718void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
1719  uint32* d = (uint32*)(dst_argb);
1720  int x;
1721  for (x = 0; x < width; ++x) {
1722    d[x] = v32;
1723  }
1724}
1725
1726// Filter 2 rows of YUY2 UV's (422) into U and V (420).
1727void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
1728                   uint8* dst_u, uint8* dst_v, int width) {
1729  // Output a row of UV values, filtering 2 rows of YUY2.
1730  int x;
1731  for (x = 0; x < width; x += 2) {
1732    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
1733    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
1734    src_yuy2 += 4;
1735    dst_u += 1;
1736    dst_v += 1;
1737  }
1738}
1739
1740// Copy row of YUY2 UV's (422) into U and V (422).
1741void YUY2ToUV422Row_C(const uint8* src_yuy2,
1742                      uint8* dst_u, uint8* dst_v, int width) {
1743  // Output a row of UV values.
1744  int x;
1745  for (x = 0; x < width; x += 2) {
1746    dst_u[0] = src_yuy2[1];
1747    dst_v[0] = src_yuy2[3];
1748    src_yuy2 += 4;
1749    dst_u += 1;
1750    dst_v += 1;
1751  }
1752}
1753
1754// Copy row of YUY2 Y's (422) into Y (420/422).
1755void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
1756  // Output a row of Y values.
1757  int x;
1758  for (x = 0; x < width - 1; x += 2) {
1759    dst_y[x] = src_yuy2[0];
1760    dst_y[x + 1] = src_yuy2[2];
1761    src_yuy2 += 4;
1762  }
1763  if (width & 1) {
1764    dst_y[width - 1] = src_yuy2[0];
1765  }
1766}
1767
1768// Filter 2 rows of UYVY UV's (422) into U and V (420).
1769void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
1770                   uint8* dst_u, uint8* dst_v, int width) {
1771  // Output a row of UV values.
1772  int x;
1773  for (x = 0; x < width; x += 2) {
1774    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
1775    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
1776    src_uyvy += 4;
1777    dst_u += 1;
1778    dst_v += 1;
1779  }
1780}
1781
1782// Copy row of UYVY UV's (422) into U and V (422).
1783void UYVYToUV422Row_C(const uint8* src_uyvy,
1784                      uint8* dst_u, uint8* dst_v, int width) {
1785  // Output a row of UV values.
1786  int x;
1787  for (x = 0; x < width; x += 2) {
1788    dst_u[0] = src_uyvy[0];
1789    dst_v[0] = src_uyvy[2];
1790    src_uyvy += 4;
1791    dst_u += 1;
1792    dst_v += 1;
1793  }
1794}
1795
1796// Copy row of UYVY Y's (422) into Y (420/422).
1797void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
1798  // Output a row of Y values.
1799  int x;
1800  for (x = 0; x < width - 1; x += 2) {
1801    dst_y[x] = src_uyvy[1];
1802    dst_y[x + 1] = src_uyvy[3];
1803    src_uyvy += 4;
1804  }
1805  if (width & 1) {
1806    dst_y[width - 1] = src_uyvy[1];
1807  }
1808}
1809
1810#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
1811
1812// Blend src_argb0 over src_argb1 and store to dst_argb.
1813// dst_argb may be src_argb0 or src_argb1.
1814// This code mimics the SSSE3 version for better testability.
1815void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
1816                    uint8* dst_argb, int width) {
1817  int x;
1818  for (x = 0; x < width - 1; x += 2) {
1819    uint32 fb = src_argb0[0];
1820    uint32 fg = src_argb0[1];
1821    uint32 fr = src_argb0[2];
1822    uint32 a = src_argb0[3];
1823    uint32 bb = src_argb1[0];
1824    uint32 bg = src_argb1[1];
1825    uint32 br = src_argb1[2];
1826    dst_argb[0] = BLEND(fb, bb, a);
1827    dst_argb[1] = BLEND(fg, bg, a);
1828    dst_argb[2] = BLEND(fr, br, a);
1829    dst_argb[3] = 255u;
1830
1831    fb = src_argb0[4 + 0];
1832    fg = src_argb0[4 + 1];
1833    fr = src_argb0[4 + 2];
1834    a = src_argb0[4 + 3];
1835    bb = src_argb1[4 + 0];
1836    bg = src_argb1[4 + 1];
1837    br = src_argb1[4 + 2];
1838    dst_argb[4 + 0] = BLEND(fb, bb, a);
1839    dst_argb[4 + 1] = BLEND(fg, bg, a);
1840    dst_argb[4 + 2] = BLEND(fr, br, a);
1841    dst_argb[4 + 3] = 255u;
1842    src_argb0 += 8;
1843    src_argb1 += 8;
1844    dst_argb += 8;
1845  }
1846
1847  if (width & 1) {
1848    uint32 fb = src_argb0[0];
1849    uint32 fg = src_argb0[1];
1850    uint32 fr = src_argb0[2];
1851    uint32 a = src_argb0[3];
1852    uint32 bb = src_argb1[0];
1853    uint32 bg = src_argb1[1];
1854    uint32 br = src_argb1[2];
1855    dst_argb[0] = BLEND(fb, bb, a);
1856    dst_argb[1] = BLEND(fg, bg, a);
1857    dst_argb[2] = BLEND(fr, br, a);
1858    dst_argb[3] = 255u;
1859  }
1860}
1861#undef BLEND
1862#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
1863
1864// Multiply source RGB by alpha and store to destination.
1865// This code mimics the SSSE3 version for better testability.
1866void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1867  int i;
1868  for (i = 0; i < width - 1; i += 2) {
1869    uint32 b = src_argb[0];
1870    uint32 g = src_argb[1];
1871    uint32 r = src_argb[2];
1872    uint32 a = src_argb[3];
1873    dst_argb[0] = ATTENUATE(b, a);
1874    dst_argb[1] = ATTENUATE(g, a);
1875    dst_argb[2] = ATTENUATE(r, a);
1876    dst_argb[3] = a;
1877    b = src_argb[4];
1878    g = src_argb[5];
1879    r = src_argb[6];
1880    a = src_argb[7];
1881    dst_argb[4] = ATTENUATE(b, a);
1882    dst_argb[5] = ATTENUATE(g, a);
1883    dst_argb[6] = ATTENUATE(r, a);
1884    dst_argb[7] = a;
1885    src_argb += 8;
1886    dst_argb += 8;
1887  }
1888
1889  if (width & 1) {
1890    const uint32 b = src_argb[0];
1891    const uint32 g = src_argb[1];
1892    const uint32 r = src_argb[2];
1893    const uint32 a = src_argb[3];
1894    dst_argb[0] = ATTENUATE(b, a);
1895    dst_argb[1] = ATTENUATE(g, a);
1896    dst_argb[2] = ATTENUATE(r, a);
1897    dst_argb[3] = a;
1898  }
1899}
1900#undef ATTENUATE
1901
1902// Divide source RGB by alpha and store to destination.
1903// b = (b * 255 + (a / 2)) / a;
1904// g = (g * 255 + (a / 2)) / a;
1905// r = (r * 255 + (a / 2)) / a;
1906// Reciprocal method is off by 1 on some values. ie 125
1907// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
1908#define T(a) 0x01000000 + (0x10000 / a)
1909const uint32 fixed_invtbl8[256] = {
1910  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
1911  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
1912  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
1913  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
1914  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
1915  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
1916  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
1917  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
1918  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
1919  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
1920  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
1921  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
1922  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
1923  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
1924  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
1925  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
1926  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
1927  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
1928  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
1929  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
1930  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
1931  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
1932  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
1933  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
1934  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
1935  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
1936  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
1937  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
1938  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
1939  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
1940  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
1941  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
1942#undef T
1943
1944void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1945  int i;
1946  for (i = 0; i < width; ++i) {
1947    uint32 b = src_argb[0];
1948    uint32 g = src_argb[1];
1949    uint32 r = src_argb[2];
1950    const uint32 a = src_argb[3];
1951    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
1952    b = (b * ia) >> 8;
1953    g = (g * ia) >> 8;
1954    r = (r * ia) >> 8;
1955    // Clamping should not be necessary but is free in assembly.
1956    dst_argb[0] = clamp255(b);
1957    dst_argb[1] = clamp255(g);
1958    dst_argb[2] = clamp255(r);
1959    dst_argb[3] = a;
1960    src_argb += 4;
1961    dst_argb += 4;
1962  }
1963}
1964
1965void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
1966                               const int32* previous_cumsum, int width) {
1967  int32 row_sum[4] = {0, 0, 0, 0};
1968  int x;
1969  for (x = 0; x < width; ++x) {
1970    row_sum[0] += row[x * 4 + 0];
1971    row_sum[1] += row[x * 4 + 1];
1972    row_sum[2] += row[x * 4 + 2];
1973    row_sum[3] += row[x * 4 + 3];
1974    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
1975    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
1976    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
1977    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
1978  }
1979}
1980
1981void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
1982                                int w, int area, uint8* dst, int count) {
1983  float ooa = 1.0f / area;
1984  int i;
1985  for (i = 0; i < count; ++i) {
1986    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
1987    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
1988    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
1989    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
1990    dst += 4;
1991    tl += 4;
1992    bl += 4;
1993  }
1994}
1995
1996// Copy pixels from rotated source to destination row with a slope.
1997LIBYUV_API
1998void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
1999                     uint8* dst_argb, const float* uv_dudv, int width) {
2000  int i;
2001  // Render a row of pixels from source into a buffer.
2002  float uv[2];
2003  uv[0] = uv_dudv[0];
2004  uv[1] = uv_dudv[1];
2005  for (i = 0; i < width; ++i) {
2006    int x = (int)(uv[0]);
2007    int y = (int)(uv[1]);
2008    *(uint32*)(dst_argb) =
2009        *(const uint32*)(src_argb + y * src_argb_stride +
2010                                         x * 4);
2011    dst_argb += 4;
2012    uv[0] += uv_dudv[2];
2013    uv[1] += uv_dudv[3];
2014  }
2015}
2016
2017// Blend 2 rows into 1.
2018static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
2019                      uint8* dst_uv, int pix) {
2020  int x;
2021  for (x = 0; x < pix; ++x) {
2022    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
2023  }
2024}
2025
2026static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
2027                         uint16* dst_uv, int pix) {
2028  int x;
2029  for (x = 0; x < pix; ++x) {
2030    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
2031  }
2032}
2033
2034// C version 2x2 -> 2x1.
2035void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
2036                      ptrdiff_t src_stride,
2037                      int width, int source_y_fraction) {
2038  int y1_fraction = source_y_fraction;
2039  int y0_fraction = 256 - y1_fraction;
2040  const uint8* src_ptr1 = src_ptr + src_stride;
2041  int x;
2042  if (source_y_fraction == 0) {
2043    memcpy(dst_ptr, src_ptr, width);
2044    return;
2045  }
2046  if (source_y_fraction == 128) {
2047    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
2048    return;
2049  }
2050  for (x = 0; x < width - 1; x += 2) {
2051    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2052    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
2053    src_ptr += 2;
2054    src_ptr1 += 2;
2055    dst_ptr += 2;
2056  }
2057  if (width & 1) {
2058    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2059  }
2060}
2061
2062void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
2063                         ptrdiff_t src_stride,
2064                         int width, int source_y_fraction) {
2065  int y1_fraction = source_y_fraction;
2066  int y0_fraction = 256 - y1_fraction;
2067  const uint16* src_ptr1 = src_ptr + src_stride;
2068  int x;
2069  if (source_y_fraction == 0) {
2070    memcpy(dst_ptr, src_ptr, width * 2);
2071    return;
2072  }
2073  if (source_y_fraction == 128) {
2074    HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
2075    return;
2076  }
2077  for (x = 0; x < width - 1; x += 2) {
2078    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2079    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
2080    src_ptr += 2;
2081    src_ptr1 += 2;
2082    dst_ptr += 2;
2083  }
2084  if (width & 1) {
2085    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2086  }
2087}
2088
2089// Use first 4 shuffler values to reorder ARGB channels.
2090void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
2091                      const uint8* shuffler, int pix) {
2092  int index0 = shuffler[0];
2093  int index1 = shuffler[1];
2094  int index2 = shuffler[2];
2095  int index3 = shuffler[3];
2096  // Shuffle a row of ARGB.
2097  int x;
2098  for (x = 0; x < pix; ++x) {
2099    // To support in-place conversion.
2100    uint8 b = src_argb[index0];
2101    uint8 g = src_argb[index1];
2102    uint8 r = src_argb[index2];
2103    uint8 a = src_argb[index3];
2104    dst_argb[0] = b;
2105    dst_argb[1] = g;
2106    dst_argb[2] = r;
2107    dst_argb[3] = a;
2108    src_argb += 4;
2109    dst_argb += 4;
2110  }
2111}
2112
2113void I422ToYUY2Row_C(const uint8* src_y,
2114                     const uint8* src_u,
2115                     const uint8* src_v,
2116                     uint8* dst_frame, int width) {
2117  int x;
2118  for (x = 0; x < width - 1; x += 2) {
2119    dst_frame[0] = src_y[0];
2120    dst_frame[1] = src_u[0];
2121    dst_frame[2] = src_y[1];
2122    dst_frame[3] = src_v[0];
2123    dst_frame += 4;
2124    src_y += 2;
2125    src_u += 1;
2126    src_v += 1;
2127  }
2128  if (width & 1) {
2129    dst_frame[0] = src_y[0];
2130    dst_frame[1] = src_u[0];
2131    dst_frame[2] = 0;
2132    dst_frame[3] = src_v[0];
2133  }
2134}
2135
2136void I422ToUYVYRow_C(const uint8* src_y,
2137                     const uint8* src_u,
2138                     const uint8* src_v,
2139                     uint8* dst_frame, int width) {
2140  int x;
2141  for (x = 0; x < width - 1; x += 2) {
2142    dst_frame[0] = src_u[0];
2143    dst_frame[1] = src_y[0];
2144    dst_frame[2] = src_v[0];
2145    dst_frame[3] = src_y[1];
2146    dst_frame += 4;
2147    src_y += 2;
2148    src_u += 1;
2149    src_v += 1;
2150  }
2151  if (width & 1) {
2152    dst_frame[0] = src_u[0];
2153    dst_frame[1] = src_y[0];
2154    dst_frame[2] = src_v[0];
2155    dst_frame[3] = 0;
2156  }
2157}
2158
// Maximum temporary width for wrappers to process at a time, in pixels.
// The wrapper functions that follow size their stack row buffers from this
// value (for example MAXTWIDTH * 4 bytes for a temporary row of 4-byte
// pixels) and walk wider rows in chunks of at most MAXTWIDTH pixels.
#define MAXTWIDTH 2048
2161
#if !(defined(_MSC_VER) && !defined(__clang__)) && \
    defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
// Step 1 runs I422ToARGBRow_SSSE3 into a temporary row, step 2 runs
// ARGBToRGB565Row_SSE2 from that row into dst_rgb565.  Rows wider than
// MAXTWIDTH are handled in chunks of at most MAXTWIDTH pixels.
void I422ToRGB565Row_SSSE3(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_rgb565,
                           int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, chunk);
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2183
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
// Two step wrapper: I422ToARGBRow_SSSE3 into a temporary row, then
// ARGBToARGB1555Row_SSE2 from that row into dst_argb1555, in chunks of at
// most MAXTWIDTH pixels.
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb1555,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, chunk);
    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2204
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
// Two step wrapper: I422ToARGBRow_SSSE3 into a temporary row, then
// ARGBToARGB4444Row_SSE2 from that row into dst_argb4444, in chunks of at
// most MAXTWIDTH pixels.
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb4444,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, chunk);
    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2225
#if defined(HAS_NV12TORGB565ROW_SSSE3)
// Two step wrapper: NV12ToARGBRow_SSSE3 into a temporary row, then
// ARGBToRGB565Row_SSE2 from that row into dst_rgb565, in chunks of at most
// MAXTWIDTH pixels.
void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
                           uint8* dst_rgb565, int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_SSSE3(src_y, src_uv, argb_tmp, chunk);
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_uv += chunk;  // The chroma pointer advances one byte per pixel.
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2242
#if defined(HAS_NV21TORGB565ROW_SSSE3)
// Two step wrapper: NV21ToARGBRow_SSSE3 into a temporary row, then
// ARGBToRGB565Row_SSE2 from that row into dst_rgb565, in chunks of at most
// MAXTWIDTH pixels.
void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
                           uint8* dst_rgb565, int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV21ToARGBRow_SSSE3(src_y, src_vu, argb_tmp, chunk);
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_vu += chunk;  // The chroma pointer advances one byte per pixel.
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2259
#if defined(HAS_YUY2TOARGBROW_SSSE3)
// Three step wrapper: YUY2ToUV422Row_SSE2 and YUY2ToYRow_SSE2 split the
// packed input into temporary U, V and Y rows, then I422ToARGBRow_SSSE3
// writes dst_argb from them.  Works in chunks of at most MAXTWIDTH pixels.
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
  // Row buffers for intermediate YUV pixels.
  SIMD_ALIGNED(uint8 y_tmp[MAXTWIDTH]);
  SIMD_ALIGNED(uint8 u_tmp[MAXTWIDTH / 2]);
  SIMD_ALIGNED(uint8 v_tmp[MAXTWIDTH / 2]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    YUY2ToUV422Row_SSE2(src_yuy2, u_tmp, v_tmp, chunk);
    YUY2ToYRow_SSE2(src_yuy2, y_tmp, chunk);
    I422ToARGBRow_SSSE3(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
    src_yuy2 += chunk * 2;  // 2 input bytes per pixel.
    dst_argb += chunk * 4;  // 4 output bytes per pixel.
  }
}
#endif
2277
#if defined(HAS_UYVYTOARGBROW_SSSE3)
// Three step wrapper: UYVYToUV422Row_SSE2 and UYVYToYRow_SSE2 split the
// packed input into temporary U, V and Y rows, then I422ToARGBRow_SSSE3
// writes dst_argb from them.  Works in chunks of at most MAXTWIDTH pixels.
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
  // Row buffers for intermediate YUV pixels.
  SIMD_ALIGNED(uint8 y_tmp[MAXTWIDTH]);
  SIMD_ALIGNED(uint8 u_tmp[MAXTWIDTH / 2]);
  SIMD_ALIGNED(uint8 v_tmp[MAXTWIDTH / 2]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    UYVYToUV422Row_SSE2(src_uyvy, u_tmp, v_tmp, chunk);
    UYVYToYRow_SSE2(src_uyvy, y_tmp, chunk);
    I422ToARGBRow_SSSE3(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
    src_uyvy += chunk * 2;  // 2 input bytes per pixel.
    dst_argb += chunk * 4;  // 4 output bytes per pixel.
  }
}
#endif  // defined(HAS_UYVYTOARGBROW_SSSE3)
2295
#if defined(HAS_I422TORGB565ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then
// ARGBToRGB565Row_AVX2 from that row into dst_rgb565, in chunks of at most
// MAXTWIDTH pixels.
void I422ToRGB565Row_AVX2(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2315
#if defined(HAS_I422TOARGB1555ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then
// ARGBToARGB1555Row_AVX2 from that row into dst_argb1555, in chunks of at
// most MAXTWIDTH pixels.
void I422ToARGB1555Row_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
    ARGBToARGB1555Row_AVX2(argb_tmp, dst_argb1555, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2336
#if defined(HAS_I422TOARGB4444ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then
// ARGBToARGB4444Row_AVX2 from that row into dst_argb4444, in chunks of at
// most MAXTWIDTH pixels.
void I422ToARGB4444Row_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
    ARGBToARGB4444Row_AVX2(argb_tmp, dst_argb4444, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2357
#if defined(HAS_I422TORGB24ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then
// ARGBToRGB24Row_SSSE3 from that row into dst_rgb24, in chunks of at most
// MAXTWIDTH pixels.
void I422ToRGB24Row_AVX2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
    // TODO(fbarchard): ARGBToRGB24Row_AVX2
    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
  }
}
#endif
2379
#if defined(HAS_I422TORAWROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then
// ARGBToRAWRow_SSSE3 from that row into dst_raw, in chunks of at most
// MAXTWIDTH pixels.
void I422ToRAWRow_AVX2(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_raw,
                       int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
    // TODO(fbarchard): ARGBToRAWRow_AVX2
    ARGBToRAWRow_SSSE3(argb_tmp, dst_raw, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // One U and one V sample per two pixels.
    src_v += chunk / 2;
    dst_raw += chunk * 3;  // 3 output bytes per pixel.
  }
}
#endif
2401
#if defined(HAS_NV12TORGB565ROW_AVX2)
// Two step wrapper: NV12ToARGBRow_AVX2 into a temporary row, then
// ARGBToRGB565Row_AVX2 from that row into dst_rgb565, in chunks of at most
// MAXTWIDTH pixels.
void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
                          uint8* dst_rgb565, int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_AVX2(src_y, src_uv, argb_tmp, chunk);
    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_uv += chunk;  // The chroma pointer advances one byte per pixel.
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2418
#if defined(HAS_NV21TORGB565ROW_AVX2)
// Two step wrapper: NV21ToARGBRow_AVX2 into a temporary row, then
// ARGBToRGB565Row_AVX2 from that row into dst_rgb565, in chunks of at most
// MAXTWIDTH pixels.
void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
                          uint8* dst_rgb565, int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV21ToARGBRow_AVX2(src_y, src_vu, argb_tmp, chunk);
    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_vu += chunk;  // The chroma pointer advances one byte per pixel.
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
2435
#if defined(HAS_YUY2TOARGBROW_AVX2)
// Three step wrapper: YUY2ToUV422Row_AVX2 and YUY2ToYRow_AVX2 split the
// packed input into temporary U, V and Y rows, then I422ToARGBRow_AVX2
// writes dst_argb from them.  Works in chunks of at most MAXTWIDTH pixels.
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
  // Row buffers for intermediate YUV pixels.
  SIMD_ALIGNED32(uint8 y_tmp[MAXTWIDTH]);
  SIMD_ALIGNED32(uint8 u_tmp[MAXTWIDTH / 2]);
  SIMD_ALIGNED32(uint8 v_tmp[MAXTWIDTH / 2]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    YUY2ToUV422Row_AVX2(src_yuy2, u_tmp, v_tmp, chunk);
    YUY2ToYRow_AVX2(src_yuy2, y_tmp, chunk);
    I422ToARGBRow_AVX2(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
    src_yuy2 += chunk * 2;  // 2 input bytes per pixel.
    dst_argb += chunk * 4;  // 4 output bytes per pixel.
  }
}
#endif
2453
#if defined(HAS_UYVYTOARGBROW_AVX2)
// Three step wrapper: UYVYToUV422Row_AVX2 and UYVYToYRow_AVX2 split the
// packed input into temporary U, V and Y rows, then I422ToARGBRow_AVX2
// writes dst_argb from them.  Works in chunks of at most MAXTWIDTH pixels.
void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
  // Row buffers for intermediate YUV pixels.
  SIMD_ALIGNED32(uint8 y_tmp[MAXTWIDTH]);
  SIMD_ALIGNED32(uint8 u_tmp[MAXTWIDTH / 2]);
  SIMD_ALIGNED32(uint8 v_tmp[MAXTWIDTH / 2]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    UYVYToUV422Row_AVX2(src_uyvy, u_tmp, v_tmp, chunk);
    UYVYToYRow_AVX2(src_uyvy, y_tmp, chunk);
    I422ToARGBRow_AVX2(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
    src_uyvy += chunk * 2;  // 2 input bytes per pixel.
    dst_argb += chunk * 4;  // 4 output bytes per pixel.
  }
}
#endif  // defined(HAS_UYVYTOARGBROW_AVX2)
2471
2472void ARGBPolynomialRow_C(const uint8* src_argb,
2473                         uint8* dst_argb, const float* poly,
2474                         int width) {
2475  int i;
2476  for (i = 0; i < width; ++i) {
2477    float b = (float)(src_argb[0]);
2478    float g = (float)(src_argb[1]);
2479    float r = (float)(src_argb[2]);
2480    float a = (float)(src_argb[3]);
2481    float b2 = b * b;
2482    float g2 = g * g;
2483    float r2 = r * r;
2484    float a2 = a * a;
2485    float db = poly[0] + poly[4] * b;
2486    float dg = poly[1] + poly[5] * g;
2487    float dr = poly[2] + poly[6] * r;
2488    float da = poly[3] + poly[7] * a;
2489    float b3 = b2 * b;
2490    float g3 = g2 * g;
2491    float r3 = r2 * r;
2492    float a3 = a2 * a;
2493    db += poly[8] * b2;
2494    dg += poly[9] * g2;
2495    dr += poly[10] * r2;
2496    da += poly[11] * a2;
2497    db += poly[12] * b3;
2498    dg += poly[13] * g3;
2499    dr += poly[14] * r3;
2500    da += poly[15] * a3;
2501
2502    dst_argb[0] = Clamp((int32)(db));
2503    dst_argb[1] = Clamp((int32)(dg));
2504    dst_argb[2] = Clamp((int32)(dr));
2505    dst_argb[3] = Clamp((int32)(da));
2506    src_argb += 4;
2507    dst_argb += 4;
2508  }
2509}
2510
2511void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
2512                             const uint8* luma, uint32 lumacoeff) {
2513  uint32 bc = lumacoeff & 0xff;
2514  uint32 gc = (lumacoeff >> 8) & 0xff;
2515  uint32 rc = (lumacoeff >> 16) & 0xff;
2516
2517  int i;
2518  for (i = 0; i < width - 1; i += 2) {
2519    // Luminance in rows, color values in columns.
2520    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
2521                           src_argb[2] * rc) & 0x7F00u) + luma;
2522    const uint8* luma1;
2523    dst_argb[0] = luma0[src_argb[0]];
2524    dst_argb[1] = luma0[src_argb[1]];
2525    dst_argb[2] = luma0[src_argb[2]];
2526    dst_argb[3] = src_argb[3];
2527    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
2528              src_argb[6] * rc) & 0x7F00u) + luma;
2529    dst_argb[4] = luma1[src_argb[4]];
2530    dst_argb[5] = luma1[src_argb[5]];
2531    dst_argb[6] = luma1[src_argb[6]];
2532    dst_argb[7] = src_argb[7];
2533    src_argb += 8;
2534    dst_argb += 8;
2535  }
2536  if (width & 1) {
2537    // Luminance in rows, color values in columns.
2538    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
2539                           src_argb[2] * rc) & 0x7F00u) + luma;
2540    dst_argb[0] = luma0[src_argb[0]];
2541    dst_argb[1] = luma0[src_argb[1]];
2542    dst_argb[2] = luma0[src_argb[2]];
2543    dst_argb[3] = src_argb[3];
2544  }
2545}
2546
2547void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
2548  int i;
2549  for (i = 0; i < width - 1; i += 2) {
2550    dst[3] = src[3];
2551    dst[7] = src[7];
2552    dst += 8;
2553    src += 8;
2554  }
2555  if (width & 1) {
2556    dst[3] = src[3];
2557  }
2558}
2559
2560void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
2561  int i;
2562  for (i = 0; i < width - 1; i += 2) {
2563    dst[3] = src[0];
2564    dst[7] = src[1];
2565    dst += 8;
2566    src += 2;
2567  }
2568  if (width & 1) {
2569    dst[3] = src[0];
2570  }
2571}
2572
2573#ifdef __cplusplus
2574}  // extern "C"
2575}  // namespace libyuv
2576#endif
2577