1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#include <string.h>  // For memcpy and memset.
14
15#include "libyuv/basic_types.h"
16
17#ifdef __cplusplus
18namespace libyuv {
19extern "C" {
20#endif
21
22// llvm x86 is poor at ternary operator, so use branchless min/max.
23
24#define USE_BRANCHLESS 1
25#if USE_BRANCHLESS
26static __inline int32 clamp0(int32 v) {
27  return ((-(v) >> 31) & (v));
28}
29
30static __inline int32 clamp255(int32 v) {
31  return (((255 - (v)) >> 31) | (v)) & 255;
32}
33
34static __inline uint32 Clamp(int32 val) {
35  int v = clamp0(val);
36  return (uint32)(clamp255(v));
37}
38
39static __inline uint32 Abs(int32 v) {
40  int m = v >> 31;
41  return (v + m) ^ m;
42}
43#else  // USE_BRANCHLESS
44static __inline int32 clamp0(int32 v) {
45  return (v < 0) ? 0 : v;
46}
47
48static __inline int32 clamp255(int32 v) {
49  return (v > 255) ? 255 : v;
50}
51
52static __inline uint32 Clamp(int32 val) {
53  int v = clamp0(val);
54  return (uint32)(clamp255(v));
55}
56
57static __inline uint32 Abs(int32 v) {
58  return (v < 0) ? -v : v;
59}
60#endif  // USE_BRANCHLESS
61
62#ifdef LIBYUV_LITTLE_ENDIAN
63#define WRITEWORD(p, v) *(uint32*)(p) = v
64#else
65static inline void WRITEWORD(uint8* p, uint32 v) {
66  p[0] = (uint8)(v & 255);
67  p[1] = (uint8)((v >> 8) & 255);
68  p[2] = (uint8)((v >> 16) & 255);
69  p[3] = (uint8)((v >> 24) & 255);
70}
71#endif
72
73void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
74  int x;
75  for (x = 0; x < width; ++x) {
76    uint8 b = src_rgb24[0];
77    uint8 g = src_rgb24[1];
78    uint8 r = src_rgb24[2];
79    dst_argb[0] = b;
80    dst_argb[1] = g;
81    dst_argb[2] = r;
82    dst_argb[3] = 255u;
83    dst_argb += 4;
84    src_rgb24 += 3;
85  }
86}
87
88void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
89  int x;
90  for (x = 0; x < width; ++x) {
91    uint8 r = src_raw[0];
92    uint8 g = src_raw[1];
93    uint8 b = src_raw[2];
94    dst_argb[0] = b;
95    dst_argb[1] = g;
96    dst_argb[2] = r;
97    dst_argb[3] = 255u;
98    dst_argb += 4;
99    src_raw += 3;
100  }
101}
102
103void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
104  int x;
105  for (x = 0; x < width; ++x) {
106    uint8 r = src_raw[0];
107    uint8 g = src_raw[1];
108    uint8 b = src_raw[2];
109    dst_rgb24[0] = b;
110    dst_rgb24[1] = g;
111    dst_rgb24[2] = r;
112    dst_rgb24 += 3;
113    src_raw += 3;
114  }
115}
116
117void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
118  int x;
119  for (x = 0; x < width; ++x) {
120    uint8 b = src_rgb565[0] & 0x1f;
121    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
122    uint8 r = src_rgb565[1] >> 3;
123    dst_argb[0] = (b << 3) | (b >> 2);
124    dst_argb[1] = (g << 2) | (g >> 4);
125    dst_argb[2] = (r << 3) | (r >> 2);
126    dst_argb[3] = 255u;
127    dst_argb += 4;
128    src_rgb565 += 2;
129  }
130}
131
132void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
133                         int width) {
134  int x;
135  for (x = 0; x < width; ++x) {
136    uint8 b = src_argb1555[0] & 0x1f;
137    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
138    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
139    uint8 a = src_argb1555[1] >> 7;
140    dst_argb[0] = (b << 3) | (b >> 2);
141    dst_argb[1] = (g << 3) | (g >> 2);
142    dst_argb[2] = (r << 3) | (r >> 2);
143    dst_argb[3] = -a;
144    dst_argb += 4;
145    src_argb1555 += 2;
146  }
147}
148
149void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
150                         int width) {
151  int x;
152  for (x = 0; x < width; ++x) {
153    uint8 b = src_argb4444[0] & 0x0f;
154    uint8 g = src_argb4444[0] >> 4;
155    uint8 r = src_argb4444[1] & 0x0f;
156    uint8 a = src_argb4444[1] >> 4;
157    dst_argb[0] = (b << 4) | b;
158    dst_argb[1] = (g << 4) | g;
159    dst_argb[2] = (r << 4) | r;
160    dst_argb[3] = (a << 4) | a;
161    dst_argb += 4;
162    src_argb4444 += 2;
163  }
164}
165
166void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
167  int x;
168  for (x = 0; x < width; ++x) {
169    uint8 b = src_argb[0];
170    uint8 g = src_argb[1];
171    uint8 r = src_argb[2];
172    dst_rgb[0] = b;
173    dst_rgb[1] = g;
174    dst_rgb[2] = r;
175    dst_rgb += 3;
176    src_argb += 4;
177  }
178}
179
180void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
181  int x;
182  for (x = 0; x < width; ++x) {
183    uint8 b = src_argb[0];
184    uint8 g = src_argb[1];
185    uint8 r = src_argb[2];
186    dst_rgb[0] = r;
187    dst_rgb[1] = g;
188    dst_rgb[2] = b;
189    dst_rgb += 3;
190    src_argb += 4;
191  }
192}
193
194void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
195  int x;
196  for (x = 0; x < width - 1; x += 2) {
197    uint8 b0 = src_argb[0] >> 3;
198    uint8 g0 = src_argb[1] >> 2;
199    uint8 r0 = src_argb[2] >> 3;
200    uint8 b1 = src_argb[4] >> 3;
201    uint8 g1 = src_argb[5] >> 2;
202    uint8 r1 = src_argb[6] >> 3;
203    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
204              (b1 << 16) | (g1 << 21) | (r1 << 27));
205    dst_rgb += 4;
206    src_argb += 8;
207  }
208  if (width & 1) {
209    uint8 b0 = src_argb[0] >> 3;
210    uint8 g0 = src_argb[1] >> 2;
211    uint8 r0 = src_argb[2] >> 3;
212    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
213  }
214}
215
216// dither4 is a row of 4 values from 4x4 dither matrix.
217// The 4x4 matrix contains values to increase RGB.  When converting to
218// fewer bits (565) this provides an ordered dither.
219// The order in the 4x4 matrix in first byte is upper left.
220// The 4 values are passed as an int, then referenced as an array, so
221// endian will not affect order of the original matrix.  But the dither4
222// will containing the first pixel in the lower byte for little endian
223// or the upper byte for big endian.
224void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
225                             const uint32 dither4, int width) {
226  int x;
227  for (x = 0; x < width - 1; x += 2) {
228    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
229    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
230    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
231    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
232    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
233    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
234    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
235    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
236    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
237              (b1 << 16) | (g1 << 21) | (r1 << 27));
238    dst_rgb += 4;
239    src_argb += 8;
240  }
241  if (width & 1) {
242    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
243    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
244    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
245    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
246    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
247  }
248}
249
250void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
251  int x;
252  for (x = 0; x < width - 1; x += 2) {
253    uint8 b0 = src_argb[0] >> 3;
254    uint8 g0 = src_argb[1] >> 3;
255    uint8 r0 = src_argb[2] >> 3;
256    uint8 a0 = src_argb[3] >> 7;
257    uint8 b1 = src_argb[4] >> 3;
258    uint8 g1 = src_argb[5] >> 3;
259    uint8 r1 = src_argb[6] >> 3;
260    uint8 a1 = src_argb[7] >> 7;
261    *(uint32*)(dst_rgb) =
262        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
263        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
264    dst_rgb += 4;
265    src_argb += 8;
266  }
267  if (width & 1) {
268    uint8 b0 = src_argb[0] >> 3;
269    uint8 g0 = src_argb[1] >> 3;
270    uint8 r0 = src_argb[2] >> 3;
271    uint8 a0 = src_argb[3] >> 7;
272    *(uint16*)(dst_rgb) =
273        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
274  }
275}
276
277void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
278  int x;
279  for (x = 0; x < width - 1; x += 2) {
280    uint8 b0 = src_argb[0] >> 4;
281    uint8 g0 = src_argb[1] >> 4;
282    uint8 r0 = src_argb[2] >> 4;
283    uint8 a0 = src_argb[3] >> 4;
284    uint8 b1 = src_argb[4] >> 4;
285    uint8 g1 = src_argb[5] >> 4;
286    uint8 r1 = src_argb[6] >> 4;
287    uint8 a1 = src_argb[7] >> 4;
288    *(uint32*)(dst_rgb) =
289        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
290        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
291    dst_rgb += 4;
292    src_argb += 8;
293  }
294  if (width & 1) {
295    uint8 b0 = src_argb[0] >> 4;
296    uint8 g0 = src_argb[1] >> 4;
297    uint8 r0 = src_argb[2] >> 4;
298    uint8 a0 = src_argb[3] >> 4;
299    *(uint16*)(dst_rgb) =
300        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
301  }
302}
303
304static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
305  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
306}
307
308static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
309  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
310}
311static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
312  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
313}
314
315#define MAKEROWY(NAME, R, G, B, BPP) \
316void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
317  int x;                                                                       \
318  for (x = 0; x < width; ++x) {                                                \
319    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
320    src_argb0 += BPP;                                                          \
321    dst_y += 1;                                                                \
322  }                                                                            \
323}                                                                              \
324void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
325                       uint8* dst_u, uint8* dst_v, int width) {                \
326  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
327  int x;                                                                       \
328  for (x = 0; x < width - 1; x += 2) {                                         \
329    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
330               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
331    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
332               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
333    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
334               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
335    dst_u[0] = RGBToU(ar, ag, ab);                                             \
336    dst_v[0] = RGBToV(ar, ag, ab);                                             \
337    src_rgb0 += BPP * 2;                                                       \
338    src_rgb1 += BPP * 2;                                                       \
339    dst_u += 1;                                                                \
340    dst_v += 1;                                                                \
341  }                                                                            \
342  if (width & 1) {                                                             \
343    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
344    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
345    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
346    dst_u[0] = RGBToU(ar, ag, ab);                                             \
347    dst_v[0] = RGBToV(ar, ag, ab);                                             \
348  }                                                                            \
349}
350
351MAKEROWY(ARGB, 2, 1, 0, 4)
352MAKEROWY(BGRA, 1, 2, 3, 4)
353MAKEROWY(ABGR, 0, 1, 2, 4)
354MAKEROWY(RGBA, 3, 2, 1, 4)
355MAKEROWY(RGB24, 2, 1, 0, 3)
356MAKEROWY(RAW, 0, 1, 2, 3)
357#undef MAKEROWY
358
359// JPeg uses a variation on BT.601-1 full range
360// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
361// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
362// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
363// BT.601 Mpeg range uses:
364// b 0.1016 * 255 = 25.908 = 25
365// g 0.5078 * 255 = 129.489 = 129
366// r 0.2578 * 255 = 65.739 = 66
367// JPeg 8 bit Y (not used):
368// b 0.11400 * 256 = 29.184 = 29
369// g 0.58700 * 256 = 150.272 = 150
370// r 0.29900 * 256 = 76.544 = 77
371// JPeg 7 bit Y:
372// b 0.11400 * 128 = 14.592 = 15
373// g 0.58700 * 128 = 75.136 = 75
374// r 0.29900 * 128 = 38.272 = 38
375// JPeg 8 bit U:
376// b  0.50000 * 255 = 127.5 = 127
377// g -0.33126 * 255 = -84.4713 = -84
378// r -0.16874 * 255 = -43.0287 = -43
379// JPeg 8 bit V:
380// b -0.08131 * 255 = -20.73405 = -20
381// g -0.41869 * 255 = -106.76595 = -107
382// r  0.50000 * 255 = 127.5 = 127
383
384static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
385  return (38 * r + 75 * g +  15 * b + 64) >> 7;
386}
387
388static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
389  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
390}
391static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
392  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
393}
394
395#define AVGB(a, b) (((a) + (b) + 1) >> 1)
396
397#define MAKEROWYJ(NAME, R, G, B, BPP) \
398void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
399  int x;                                                                       \
400  for (x = 0; x < width; ++x) {                                                \
401    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
402    src_argb0 += BPP;                                                          \
403    dst_y += 1;                                                                \
404  }                                                                            \
405}                                                                              \
406void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
407                        uint8* dst_u, uint8* dst_v, int width) {               \
408  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
409  int x;                                                                       \
410  for (x = 0; x < width - 1; x += 2) {                                         \
411    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
412                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
413    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
414                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
415    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
416                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
417    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
418    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
419    src_rgb0 += BPP * 2;                                                       \
420    src_rgb1 += BPP * 2;                                                       \
421    dst_u += 1;                                                                \
422    dst_v += 1;                                                                \
423  }                                                                            \
424  if (width & 1) {                                                             \
425    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
426    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
427    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
428    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
429    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
430  }                                                                            \
431}
432
433MAKEROWYJ(ARGB, 2, 1, 0, 4)
434#undef MAKEROWYJ
435
436void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
437  int x;
438  for (x = 0; x < width; ++x) {
439    uint8 b = src_rgb565[0] & 0x1f;
440    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
441    uint8 r = src_rgb565[1] >> 3;
442    b = (b << 3) | (b >> 2);
443    g = (g << 2) | (g >> 4);
444    r = (r << 3) | (r >> 2);
445    dst_y[0] = RGBToY(r, g, b);
446    src_rgb565 += 2;
447    dst_y += 1;
448  }
449}
450
451void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
452  int x;
453  for (x = 0; x < width; ++x) {
454    uint8 b = src_argb1555[0] & 0x1f;
455    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
456    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
457    b = (b << 3) | (b >> 2);
458    g = (g << 3) | (g >> 2);
459    r = (r << 3) | (r >> 2);
460    dst_y[0] = RGBToY(r, g, b);
461    src_argb1555 += 2;
462    dst_y += 1;
463  }
464}
465
466void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
467  int x;
468  for (x = 0; x < width; ++x) {
469    uint8 b = src_argb4444[0] & 0x0f;
470    uint8 g = src_argb4444[0] >> 4;
471    uint8 r = src_argb4444[1] & 0x0f;
472    b = (b << 4) | b;
473    g = (g << 4) | g;
474    r = (r << 4) | r;
475    dst_y[0] = RGBToY(r, g, b);
476    src_argb4444 += 2;
477    dst_y += 1;
478  }
479}
480
481void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
482                     uint8* dst_u, uint8* dst_v, int width) {
483  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
484  int x;
485  for (x = 0; x < width - 1; x += 2) {
486    uint8 b0 = src_rgb565[0] & 0x1f;
487    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
488    uint8 r0 = src_rgb565[1] >> 3;
489    uint8 b1 = src_rgb565[2] & 0x1f;
490    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
491    uint8 r1 = src_rgb565[3] >> 3;
492    uint8 b2 = next_rgb565[0] & 0x1f;
493    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
494    uint8 r2 = next_rgb565[1] >> 3;
495    uint8 b3 = next_rgb565[2] & 0x1f;
496    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
497    uint8 r3 = next_rgb565[3] >> 3;
498    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
499    uint8 g = (g0 + g1 + g2 + g3);
500    uint8 r = (r0 + r1 + r2 + r3);
501    b = (b << 1) | (b >> 6);  // 787 -> 888.
502    r = (r << 1) | (r >> 6);
503    dst_u[0] = RGBToU(r, g, b);
504    dst_v[0] = RGBToV(r, g, b);
505    src_rgb565 += 4;
506    next_rgb565 += 4;
507    dst_u += 1;
508    dst_v += 1;
509  }
510  if (width & 1) {
511    uint8 b0 = src_rgb565[0] & 0x1f;
512    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
513    uint8 r0 = src_rgb565[1] >> 3;
514    uint8 b2 = next_rgb565[0] & 0x1f;
515    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
516    uint8 r2 = next_rgb565[1] >> 3;
517    uint8 b = (b0 + b2);  // 565 * 2 = 676.
518    uint8 g = (g0 + g2);
519    uint8 r = (r0 + r2);
520    b = (b << 2) | (b >> 4);  // 676 -> 888
521    g = (g << 1) | (g >> 6);
522    r = (r << 2) | (r >> 4);
523    dst_u[0] = RGBToU(r, g, b);
524    dst_v[0] = RGBToV(r, g, b);
525  }
526}
527
528void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
529                       uint8* dst_u, uint8* dst_v, int width) {
530  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
531  int x;
532  for (x = 0; x < width - 1; x += 2) {
533    uint8 b0 = src_argb1555[0] & 0x1f;
534    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
535    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
536    uint8 b1 = src_argb1555[2] & 0x1f;
537    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
538    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
539    uint8 b2 = next_argb1555[0] & 0x1f;
540    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
541    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
542    uint8 b3 = next_argb1555[2] & 0x1f;
543    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
544    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
545    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
546    uint8 g = (g0 + g1 + g2 + g3);
547    uint8 r = (r0 + r1 + r2 + r3);
548    b = (b << 1) | (b >> 6);  // 777 -> 888.
549    g = (g << 1) | (g >> 6);
550    r = (r << 1) | (r >> 6);
551    dst_u[0] = RGBToU(r, g, b);
552    dst_v[0] = RGBToV(r, g, b);
553    src_argb1555 += 4;
554    next_argb1555 += 4;
555    dst_u += 1;
556    dst_v += 1;
557  }
558  if (width & 1) {
559    uint8 b0 = src_argb1555[0] & 0x1f;
560    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
561    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
562    uint8 b2 = next_argb1555[0] & 0x1f;
563    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
564    uint8 r2 = next_argb1555[1] >> 3;
565    uint8 b = (b0 + b2);  // 555 * 2 = 666.
566    uint8 g = (g0 + g2);
567    uint8 r = (r0 + r2);
568    b = (b << 2) | (b >> 4);  // 666 -> 888.
569    g = (g << 2) | (g >> 4);
570    r = (r << 2) | (r >> 4);
571    dst_u[0] = RGBToU(r, g, b);
572    dst_v[0] = RGBToV(r, g, b);
573  }
574}
575
576void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
577                       uint8* dst_u, uint8* dst_v, int width) {
578  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
579  int x;
580  for (x = 0; x < width - 1; x += 2) {
581    uint8 b0 = src_argb4444[0] & 0x0f;
582    uint8 g0 = src_argb4444[0] >> 4;
583    uint8 r0 = src_argb4444[1] & 0x0f;
584    uint8 b1 = src_argb4444[2] & 0x0f;
585    uint8 g1 = src_argb4444[2] >> 4;
586    uint8 r1 = src_argb4444[3] & 0x0f;
587    uint8 b2 = next_argb4444[0] & 0x0f;
588    uint8 g2 = next_argb4444[0] >> 4;
589    uint8 r2 = next_argb4444[1] & 0x0f;
590    uint8 b3 = next_argb4444[2] & 0x0f;
591    uint8 g3 = next_argb4444[2] >> 4;
592    uint8 r3 = next_argb4444[3] & 0x0f;
593    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
594    uint8 g = (g0 + g1 + g2 + g3);
595    uint8 r = (r0 + r1 + r2 + r3);
596    b = (b << 2) | (b >> 4);  // 666 -> 888.
597    g = (g << 2) | (g >> 4);
598    r = (r << 2) | (r >> 4);
599    dst_u[0] = RGBToU(r, g, b);
600    dst_v[0] = RGBToV(r, g, b);
601    src_argb4444 += 4;
602    next_argb4444 += 4;
603    dst_u += 1;
604    dst_v += 1;
605  }
606  if (width & 1) {
607    uint8 b0 = src_argb4444[0] & 0x0f;
608    uint8 g0 = src_argb4444[0] >> 4;
609    uint8 r0 = src_argb4444[1] & 0x0f;
610    uint8 b2 = next_argb4444[0] & 0x0f;
611    uint8 g2 = next_argb4444[0] >> 4;
612    uint8 r2 = next_argb4444[1] & 0x0f;
613    uint8 b = (b0 + b2);  // 444 * 2 = 555.
614    uint8 g = (g0 + g2);
615    uint8 r = (r0 + r2);
616    b = (b << 3) | (b >> 2);  // 555 -> 888.
617    g = (g << 3) | (g >> 2);
618    r = (r << 3) | (r >> 2);
619    dst_u[0] = RGBToU(r, g, b);
620    dst_v[0] = RGBToV(r, g, b);
621  }
622}
623
624void ARGBToUV444Row_C(const uint8* src_argb,
625                      uint8* dst_u, uint8* dst_v, int width) {
626  int x;
627  for (x = 0; x < width; ++x) {
628    uint8 ab = src_argb[0];
629    uint8 ag = src_argb[1];
630    uint8 ar = src_argb[2];
631    dst_u[0] = RGBToU(ar, ag, ab);
632    dst_v[0] = RGBToV(ar, ag, ab);
633    src_argb += 4;
634    dst_u += 1;
635    dst_v += 1;
636  }
637}
638
639void ARGBToUV411Row_C(const uint8* src_argb,
640                      uint8* dst_u, uint8* dst_v, int width) {
641  int x;
642  for (x = 0; x < width - 3; x += 4) {
643    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
644    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
645    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
646    dst_u[0] = RGBToU(ar, ag, ab);
647    dst_v[0] = RGBToV(ar, ag, ab);
648    src_argb += 16;
649    dst_u += 1;
650    dst_v += 1;
651  }
652  // Odd width handling mimics 'any' function which replicates last pixel.
653  if ((width & 3) == 3) {
654    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
655    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
656    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
657    dst_u[0] = RGBToU(ar, ag, ab);
658    dst_v[0] = RGBToV(ar, ag, ab);
659  } else if ((width & 3) == 2) {
660    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
661    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
662    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
663    dst_u[0] = RGBToU(ar, ag, ab);
664    dst_v[0] = RGBToV(ar, ag, ab);
665  } else if ((width & 3) == 1) {
666    uint8 ab = src_argb[0];
667    uint8 ag = src_argb[1];
668    uint8 ar = src_argb[2];
669    dst_u[0] = RGBToU(ar, ag, ab);
670    dst_v[0] = RGBToV(ar, ag, ab);
671  }
672}
673
674void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
675  int x;
676  for (x = 0; x < width; ++x) {
677    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
678    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
679    dst_argb[3] = src_argb[3];
680    dst_argb += 4;
681    src_argb += 4;
682  }
683}
684
685// Convert a row of image to Sepia tone.
686void ARGBSepiaRow_C(uint8* dst_argb, int width) {
687  int x;
688  for (x = 0; x < width; ++x) {
689    int b = dst_argb[0];
690    int g = dst_argb[1];
691    int r = dst_argb[2];
692    int sb = (b * 17 + g * 68 + r * 35) >> 7;
693    int sg = (b * 22 + g * 88 + r * 45) >> 7;
694    int sr = (b * 24 + g * 98 + r * 50) >> 7;
695    // b does not over flow. a is preserved from original.
696    dst_argb[0] = sb;
697    dst_argb[1] = clamp255(sg);
698    dst_argb[2] = clamp255(sr);
699    dst_argb += 4;
700  }
701}
702
703// Apply color matrix to a row of image. Matrix is signed.
704// TODO(fbarchard): Consider adding rounding (+32).
705void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
706                          const int8* matrix_argb, int width) {
707  int x;
708  for (x = 0; x < width; ++x) {
709    int b = src_argb[0];
710    int g = src_argb[1];
711    int r = src_argb[2];
712    int a = src_argb[3];
713    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
714              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
715    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
716              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
717    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
718              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
719    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
720              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
721    dst_argb[0] = Clamp(sb);
722    dst_argb[1] = Clamp(sg);
723    dst_argb[2] = Clamp(sr);
724    dst_argb[3] = Clamp(sa);
725    src_argb += 4;
726    dst_argb += 4;
727  }
728}
729
730// Apply color table to a row of image.
731void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
732  int x;
733  for (x = 0; x < width; ++x) {
734    int b = dst_argb[0];
735    int g = dst_argb[1];
736    int r = dst_argb[2];
737    int a = dst_argb[3];
738    dst_argb[0] = table_argb[b * 4 + 0];
739    dst_argb[1] = table_argb[g * 4 + 1];
740    dst_argb[2] = table_argb[r * 4 + 2];
741    dst_argb[3] = table_argb[a * 4 + 3];
742    dst_argb += 4;
743  }
744}
745
746// Apply color table to a row of image.
747void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
748  int x;
749  for (x = 0; x < width; ++x) {
750    int b = dst_argb[0];
751    int g = dst_argb[1];
752    int r = dst_argb[2];
753    dst_argb[0] = table_argb[b * 4 + 0];
754    dst_argb[1] = table_argb[g * 4 + 1];
755    dst_argb[2] = table_argb[r * 4 + 2];
756    dst_argb += 4;
757  }
758}
759
760void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
761                       int interval_offset, int width) {
762  int x;
763  for (x = 0; x < width; ++x) {
764    int b = dst_argb[0];
765    int g = dst_argb[1];
766    int r = dst_argb[2];
767    dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
768    dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
769    dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
770    dst_argb += 4;
771  }
772}
773
774#define REPEAT8(v) (v) | ((v) << 8)
775#define SHADE(f, v) v * f >> 24
776
777void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
778                    uint32 value) {
779  const uint32 b_scale = REPEAT8(value & 0xff);
780  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
781  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
782  const uint32 a_scale = REPEAT8(value >> 24);
783
784  int i;
785  for (i = 0; i < width; ++i) {
786    const uint32 b = REPEAT8(src_argb[0]);
787    const uint32 g = REPEAT8(src_argb[1]);
788    const uint32 r = REPEAT8(src_argb[2]);
789    const uint32 a = REPEAT8(src_argb[3]);
790    dst_argb[0] = SHADE(b, b_scale);
791    dst_argb[1] = SHADE(g, g_scale);
792    dst_argb[2] = SHADE(r, r_scale);
793    dst_argb[3] = SHADE(a, a_scale);
794    src_argb += 4;
795    dst_argb += 4;
796  }
797}
798#undef REPEAT8
799#undef SHADE
800
801#define REPEAT8(v) (v) | ((v) << 8)
802#define SHADE(f, v) v * f >> 16
803
804void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
805                       uint8* dst_argb, int width) {
806  int i;
807  for (i = 0; i < width; ++i) {
808    const uint32 b = REPEAT8(src_argb0[0]);
809    const uint32 g = REPEAT8(src_argb0[1]);
810    const uint32 r = REPEAT8(src_argb0[2]);
811    const uint32 a = REPEAT8(src_argb0[3]);
812    const uint32 b_scale = src_argb1[0];
813    const uint32 g_scale = src_argb1[1];
814    const uint32 r_scale = src_argb1[2];
815    const uint32 a_scale = src_argb1[3];
816    dst_argb[0] = SHADE(b, b_scale);
817    dst_argb[1] = SHADE(g, g_scale);
818    dst_argb[2] = SHADE(r, r_scale);
819    dst_argb[3] = SHADE(a, a_scale);
820    src_argb0 += 4;
821    src_argb1 += 4;
822    dst_argb += 4;
823  }
824}
825#undef REPEAT8
826#undef SHADE
827
828#define SHADE(f, v) clamp255(v + f)
829
830void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
831                  uint8* dst_argb, int width) {
832  int i;
833  for (i = 0; i < width; ++i) {
834    const int b = src_argb0[0];
835    const int g = src_argb0[1];
836    const int r = src_argb0[2];
837    const int a = src_argb0[3];
838    const int b_add = src_argb1[0];
839    const int g_add = src_argb1[1];
840    const int r_add = src_argb1[2];
841    const int a_add = src_argb1[3];
842    dst_argb[0] = SHADE(b, b_add);
843    dst_argb[1] = SHADE(g, g_add);
844    dst_argb[2] = SHADE(r, r_add);
845    dst_argb[3] = SHADE(a, a_add);
846    src_argb0 += 4;
847    src_argb1 += 4;
848    dst_argb += 4;
849  }
850}
851#undef SHADE
852
853#define SHADE(f, v) clamp0(f - v)
854
855void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
856                       uint8* dst_argb, int width) {
857  int i;
858  for (i = 0; i < width; ++i) {
859    const int b = src_argb0[0];
860    const int g = src_argb0[1];
861    const int r = src_argb0[2];
862    const int a = src_argb0[3];
863    const int b_sub = src_argb1[0];
864    const int g_sub = src_argb1[1];
865    const int r_sub = src_argb1[2];
866    const int a_sub = src_argb1[3];
867    dst_argb[0] = SHADE(b, b_sub);
868    dst_argb[1] = SHADE(g, g_sub);
869    dst_argb[2] = SHADE(r, r_sub);
870    dst_argb[3] = SHADE(a, a_sub);
871    src_argb0 += 4;
872    src_argb1 += 4;
873    dst_argb += 4;
874  }
875}
876#undef SHADE
877
878// Sobel functions which mimics SSSE3.
879void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
880                 uint8* dst_sobelx, int width) {
881  int i;
882  for (i = 0; i < width; ++i) {
883    int a = src_y0[i];
884    int b = src_y1[i];
885    int c = src_y2[i];
886    int a_sub = src_y0[i + 2];
887    int b_sub = src_y1[i + 2];
888    int c_sub = src_y2[i + 2];
889    int a_diff = a - a_sub;
890    int b_diff = b - b_sub;
891    int c_diff = c - c_sub;
892    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
893    dst_sobelx[i] = (uint8)(clamp255(sobel));
894  }
895}
896
897void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
898                 uint8* dst_sobely, int width) {
899  int i;
900  for (i = 0; i < width; ++i) {
901    int a = src_y0[i + 0];
902    int b = src_y0[i + 1];
903    int c = src_y0[i + 2];
904    int a_sub = src_y1[i + 0];
905    int b_sub = src_y1[i + 1];
906    int c_sub = src_y1[i + 2];
907    int a_diff = a - a_sub;
908    int b_diff = b - b_sub;
909    int c_diff = c - c_sub;
910    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
911    dst_sobely[i] = (uint8)(clamp255(sobel));
912  }
913}
914
915void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
916                uint8* dst_argb, int width) {
917  int i;
918  for (i = 0; i < width; ++i) {
919    int r = src_sobelx[i];
920    int b = src_sobely[i];
921    int s = clamp255(r + b);
922    dst_argb[0] = (uint8)(s);
923    dst_argb[1] = (uint8)(s);
924    dst_argb[2] = (uint8)(s);
925    dst_argb[3] = (uint8)(255u);
926    dst_argb += 4;
927  }
928}
929
930void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
931                       uint8* dst_y, int width) {
932  int i;
933  for (i = 0; i < width; ++i) {
934    int r = src_sobelx[i];
935    int b = src_sobely[i];
936    int s = clamp255(r + b);
937    dst_y[i] = (uint8)(s);
938  }
939}
940
941void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
942                  uint8* dst_argb, int width) {
943  int i;
944  for (i = 0; i < width; ++i) {
945    int r = src_sobelx[i];
946    int b = src_sobely[i];
947    int g = clamp255(r + b);
948    dst_argb[0] = (uint8)(b);
949    dst_argb[1] = (uint8)(g);
950    dst_argb[2] = (uint8)(r);
951    dst_argb[3] = (uint8)(255u);
952    dst_argb += 4;
953  }
954}
955
956void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
957  // Copy a Y to RGB.
958  int x;
959  for (x = 0; x < width; ++x) {
960    uint8 y = src_y[0];
961    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
962    dst_argb[3] = 255u;
963    dst_argb += 4;
964    ++src_y;
965  }
966}
967
968// TODO(fbarchard): Unify these structures to be platform independent.
969// TODO(fbarchard): Generate SIMD structures from float matrix.
970
971// BT.601 YUV to RGB reference
972//  R = (Y - 16) * 1.164              - V * -1.596
973//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
974//  B = (Y - 16) * 1.164 - U * -2.018
975
// Y contribution to R,G,B.  Scale and bias.
// Coefficients are fixed point scaled by 64; YuvPixel shifts the final sums
// right by 6 to remove that scale.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
// UB is limited to -128 (the unclamped value would be -129), presumably so
// that it fits a signed 8-bit lane.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
// Each bias folds "coefficient * 128" together with the Y bias YGB.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)

// The same coefficients are laid out differently per architecture. The C
// reference YuvPixel reads them back with a matching #if, so each layout
// below has to stay in step with the corresponding branch there.
// NOTE(review): the row-to-field mapping is inferred from the fields YuvPixel
// reads; struct YuvConstants itself is declared elsewhere - confirm.
#if defined(__aarch64__)  // 64 bit arm
// UB/VR are stored negated and interleaved; YG is stored multiplied by
// 0x0101. The kYvu variant swaps the U and V coefficients and BB with BR.
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
  { UG, VG, UG, VG, UG, VG, UG, VG },
  { UG, VG, UG, VG, UG, VG, UG, VG },
  { BB, BG, BR, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
  { VG, UG, VG, UG, VG, UG, VG, UG },
  { VG, UG, VG, UG, VG, UG, VG, UG },
  { BR, BG, BB, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
#elif defined(__arm__)  // 32 bit arm
// UB/VR are stored negated in groups of four (U group at index 0, V group at
// index 4); YG is stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
  { BB, BG, BR, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
  { BR, BG, BB, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
#else
// All other targets: coefficients are stored with their sign, as interleaved
// (U, V) pairs, followed by one row per bias and one row of YG.
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
#endif

// Release the short macro names so the next colorspace can redefine them.
#undef BB
#undef BG
#undef BR
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
1057
1058// JPEG YUV to RGB reference
1059// *  R = Y                - V * -1.40200
1060// *  G = Y - U *  0.34414 - V *  0.71414
1061// *  B = Y - U * -1.77200
1062
// Y contribution to R,G,B.  Scale and bias.
// Coefficients are fixed point scaled by 64. Y is not offset here, so YGB
// only carries the rounding term.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGB 32  /* 64 / 2 */

// U and V contributions to R,G,B.
#define UB -113 /* round(-1.77200 * 64) */
#define UG 22 /* round(0.34414 * 64) */
#define VG 46 /* round(0.71414  * 64) */
#define VR -90 /* round(-1.40200 * 64) */

// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)

// Per-architecture layouts; each must stay in step with the matching #if
// branch of YuvPixel, which reads the coefficients back.
// NOTE(review): the row-to-field mapping is inferred from the fields YuvPixel
// reads; struct YuvConstants itself is declared elsewhere - confirm.
#if defined(__aarch64__)
// UB/VR stored negated and interleaved; YG stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
  { UG, VG, UG, VG, UG, VG, UG, VG },
  { UG, VG, UG, VG, UG, VG, UG, VG },
  { BB, BG, BR, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
  { VG, UG, VG, UG, VG, UG, VG, UG },
  { VG, UG, VG, UG, VG, UG, VG, UG },
  { BR, BG, BB, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
#elif defined(__arm__)
// UB/VR stored negated in groups of four; YG stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
  { BB, BG, BR, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
  { BR, BG, BB, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
#else
// All other targets: signed coefficients as interleaved (U, V) pairs, then
// one row per bias and one row of YG.
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
#endif

// Release the short macro names so the next colorspace can redefine them.
#undef BB
#undef BG
#undef BR
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
1144
1145// BT.709 YUV to RGB reference
1146// *  R = Y                - V * -1.28033
1147// *  G = Y - U *  0.21482 - V *  0.38059
1148// *  B = Y - U * -2.12798
1149
// Y contribution to R,G,B.  Scale and bias.
// Coefficients are fixed point scaled by 64. Y is not offset here, so YGB
// only carries the rounding term.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGB 32  /* 64 / 2 */

// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
// U and V contributions to R,G,B.
// UB is limited to -128, i.e. a blue gain of 2.0 rather than 2.12798.
#define UB -128 /* max(-128, round(-2.12798 * 64)) */
#define UG 14 /* round(0.21482 * 64) */
#define VG 24 /* round(0.38059  * 64) */
#define VR -82 /* round(-1.28033 * 64) */

// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)

// Per-architecture layouts; each must stay in step with the matching #if
// branch of YuvPixel, which reads the coefficients back.
// NOTE(review): the row-to-field mapping is inferred from the fields YuvPixel
// reads; struct YuvConstants itself is declared elsewhere - confirm.
#if defined(__aarch64__)
// UB/VR stored negated and interleaved; YG stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
  { UG, VG, UG, VG, UG, VG, UG, VG },
  { UG, VG, UG, VG, UG, VG, UG, VG },
  { BB, BG, BR, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
  { VG, UG, VG, UG, VG, UG, VG, UG },
  { VG, UG, VG, UG, VG, UG, VG, UG },
  { BR, BG, BB, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
#elif defined(__arm__)
// UB/VR stored negated in groups of four; YG stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
  { BB, BG, BR, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
  { BR, BG, BB, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
};
#else
// All other targets: signed coefficients as interleaved (U, V) pairs, then
// one row per bias and one row of YG.
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
#endif

// Release the short macro names once the tables are defined.
#undef BB
#undef BG
#undef BR
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
1232
1233// C reference code that mimics the YUV assembly.
1234static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
1235                              uint8* b, uint8* g, uint8* r,
1236                              const struct YuvConstants* yuvconstants) {
1237#if defined(__aarch64__)
1238  int ub = -yuvconstants->kUVToRB[0];
1239  int ug = yuvconstants->kUVToG[0];
1240  int vg = yuvconstants->kUVToG[1];
1241  int vr = -yuvconstants->kUVToRB[1];
1242  int bb = yuvconstants->kUVBiasBGR[0];
1243  int bg = yuvconstants->kUVBiasBGR[1];
1244  int br = yuvconstants->kUVBiasBGR[2];
1245  int yg = yuvconstants->kYToRgb[0] / 0x0101;
1246#elif defined(__arm__)
1247  int ub = -yuvconstants->kUVToRB[0];
1248  int ug = yuvconstants->kUVToG[0];
1249  int vg = yuvconstants->kUVToG[4];
1250  int vr = -yuvconstants->kUVToRB[4];
1251  int bb = yuvconstants->kUVBiasBGR[0];
1252  int bg = yuvconstants->kUVBiasBGR[1];
1253  int br = yuvconstants->kUVBiasBGR[2];
1254  int yg = yuvconstants->kYToRgb[0] / 0x0101;
1255#else
1256  int ub = yuvconstants->kUVToB[0];
1257  int ug = yuvconstants->kUVToG[0];
1258  int vg = yuvconstants->kUVToG[1];
1259  int vr = yuvconstants->kUVToR[1];
1260  int bb = yuvconstants->kUVBiasB[0];
1261  int bg = yuvconstants->kUVBiasG[0];
1262  int br = yuvconstants->kUVBiasR[0];
1263  int yg = yuvconstants->kYToRgb[0];
1264#endif
1265
1266  uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
1267  *b = Clamp((int32)(-(u * ub)          + y1 + bb) >> 6);
1268  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
1269  *r = Clamp((int32)         (-(v * vr) + y1 + br) >> 6);
1270}
1271
1272// Y contribution to R,G,B.  Scale and bias.
1273#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
1274#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
1275
1276// C reference code that mimics the YUV assembly.
1277static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
1278  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
1279  *b = Clamp((int32)(y1 + YGB) >> 6);
1280  *g = Clamp((int32)(y1 + YGB) >> 6);
1281  *r = Clamp((int32)(y1 + YGB) >> 6);
1282}
1283
1284#undef YG
1285#undef YGB
1286
1287#if !defined(LIBYUV_DISABLE_NEON) && \
1288    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
1289// C mimic assembly.
1290// TODO(fbarchard): Remove subsampling from Neon.
1291void I444ToARGBRow_C(const uint8* src_y,
1292                     const uint8* src_u,
1293                     const uint8* src_v,
1294                     uint8* rgb_buf,
1295                     const struct YuvConstants* yuvconstants,
1296                     int width) {
1297  int x;
1298  for (x = 0; x < width - 1; x += 2) {
1299    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
1300    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
1301    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
1302             yuvconstants);
1303    rgb_buf[3] = 255;
1304    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
1305             yuvconstants);
1306    rgb_buf[7] = 255;
1307    src_y += 2;
1308    src_u += 2;
1309    src_v += 2;
1310    rgb_buf += 8;  // Advance 2 pixels.
1311  }
1312  if (width & 1) {
1313    YuvPixel(src_y[0], src_u[0], src_v[0],
1314             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1315    rgb_buf[3] = 255;
1316  }
1317}
1318#else
1319void I444ToARGBRow_C(const uint8* src_y,
1320                     const uint8* src_u,
1321                     const uint8* src_v,
1322                     uint8* rgb_buf,
1323                     const struct YuvConstants* yuvconstants,
1324                     int width) {
1325  int x;
1326  for (x = 0; x < width; ++x) {
1327    YuvPixel(src_y[0], src_u[0], src_v[0],
1328             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1329    rgb_buf[3] = 255;
1330    src_y += 1;
1331    src_u += 1;
1332    src_v += 1;
1333    rgb_buf += 4;  // Advance 1 pixel.
1334  }
1335}
1336#endif
1337
1338// Also used for 420
1339void I422ToARGBRow_C(const uint8* src_y,
1340                     const uint8* src_u,
1341                     const uint8* src_v,
1342                     uint8* rgb_buf,
1343                     const struct YuvConstants* yuvconstants,
1344                     int width) {
1345  int x;
1346  for (x = 0; x < width - 1; x += 2) {
1347    YuvPixel(src_y[0], src_u[0], src_v[0],
1348             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1349    rgb_buf[3] = 255;
1350    YuvPixel(src_y[1], src_u[0], src_v[0],
1351             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1352    rgb_buf[7] = 255;
1353    src_y += 2;
1354    src_u += 1;
1355    src_v += 1;
1356    rgb_buf += 8;  // Advance 2 pixels.
1357  }
1358  if (width & 1) {
1359    YuvPixel(src_y[0], src_u[0], src_v[0],
1360             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1361    rgb_buf[3] = 255;
1362  }
1363}
1364
1365void I422AlphaToARGBRow_C(const uint8* src_y,
1366                          const uint8* src_u,
1367                          const uint8* src_v,
1368                          const uint8* src_a,
1369                          uint8* rgb_buf,
1370                          const struct YuvConstants* yuvconstants,
1371                          int width) {
1372  int x;
1373  for (x = 0; x < width - 1; x += 2) {
1374    YuvPixel(src_y[0], src_u[0], src_v[0],
1375             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1376    rgb_buf[3] = src_a[0];
1377    YuvPixel(src_y[1], src_u[0], src_v[0],
1378             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1379    rgb_buf[7] = src_a[1];
1380    src_y += 2;
1381    src_u += 1;
1382    src_v += 1;
1383    src_a += 2;
1384    rgb_buf += 8;  // Advance 2 pixels.
1385  }
1386  if (width & 1) {
1387    YuvPixel(src_y[0], src_u[0], src_v[0],
1388             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1389    rgb_buf[3] = src_a[0];
1390  }
1391}
1392
1393void I422ToRGB24Row_C(const uint8* src_y,
1394                      const uint8* src_u,
1395                      const uint8* src_v,
1396                      uint8* rgb_buf,
1397                      const struct YuvConstants* yuvconstants,
1398                      int width) {
1399  int x;
1400  for (x = 0; x < width - 1; x += 2) {
1401    YuvPixel(src_y[0], src_u[0], src_v[0],
1402             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1403    YuvPixel(src_y[1], src_u[0], src_v[0],
1404             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
1405    src_y += 2;
1406    src_u += 1;
1407    src_v += 1;
1408    rgb_buf += 6;  // Advance 2 pixels.
1409  }
1410  if (width & 1) {
1411    YuvPixel(src_y[0], src_u[0], src_v[0],
1412             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1413  }
1414}
1415
1416void I422ToARGB4444Row_C(const uint8* src_y,
1417                         const uint8* src_u,
1418                         const uint8* src_v,
1419                         uint8* dst_argb4444,
1420                         const struct YuvConstants* yuvconstants,
1421                         int width) {
1422  uint8 b0;
1423  uint8 g0;
1424  uint8 r0;
1425  uint8 b1;
1426  uint8 g1;
1427  uint8 r1;
1428  int x;
1429  for (x = 0; x < width - 1; x += 2) {
1430    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1431    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
1432    b0 = b0 >> 4;
1433    g0 = g0 >> 4;
1434    r0 = r0 >> 4;
1435    b1 = b1 >> 4;
1436    g1 = g1 >> 4;
1437    r1 = r1 >> 4;
1438    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1439        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
1440    src_y += 2;
1441    src_u += 1;
1442    src_v += 1;
1443    dst_argb4444 += 4;  // Advance 2 pixels.
1444  }
1445  if (width & 1) {
1446    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1447    b0 = b0 >> 4;
1448    g0 = g0 >> 4;
1449    r0 = r0 >> 4;
1450    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1451        0xf000;
1452  }
1453}
1454
1455void I422ToARGB1555Row_C(const uint8* src_y,
1456                         const uint8* src_u,
1457                         const uint8* src_v,
1458                         uint8* dst_argb1555,
1459                         const struct YuvConstants* yuvconstants,
1460                         int width) {
1461  uint8 b0;
1462  uint8 g0;
1463  uint8 r0;
1464  uint8 b1;
1465  uint8 g1;
1466  uint8 r1;
1467  int x;
1468  for (x = 0; x < width - 1; x += 2) {
1469    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1470    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
1471    b0 = b0 >> 3;
1472    g0 = g0 >> 3;
1473    r0 = r0 >> 3;
1474    b1 = b1 >> 3;
1475    g1 = g1 >> 3;
1476    r1 = r1 >> 3;
1477    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1478        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
1479    src_y += 2;
1480    src_u += 1;
1481    src_v += 1;
1482    dst_argb1555 += 4;  // Advance 2 pixels.
1483  }
1484  if (width & 1) {
1485    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1486    b0 = b0 >> 3;
1487    g0 = g0 >> 3;
1488    r0 = r0 >> 3;
1489    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1490        0x8000;
1491  }
1492}
1493
1494void I422ToRGB565Row_C(const uint8* src_y,
1495                       const uint8* src_u,
1496                       const uint8* src_v,
1497                       uint8* dst_rgb565,
1498                       const struct YuvConstants* yuvconstants,
1499                       int width) {
1500  uint8 b0;
1501  uint8 g0;
1502  uint8 r0;
1503  uint8 b1;
1504  uint8 g1;
1505  uint8 r1;
1506  int x;
1507  for (x = 0; x < width - 1; x += 2) {
1508    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1509    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
1510    b0 = b0 >> 3;
1511    g0 = g0 >> 2;
1512    r0 = r0 >> 3;
1513    b1 = b1 >> 3;
1514    g1 = g1 >> 2;
1515    r1 = r1 >> 3;
1516    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1517        (b1 << 16) | (g1 << 21) | (r1 << 27);
1518    src_y += 2;
1519    src_u += 1;
1520    src_v += 1;
1521    dst_rgb565 += 4;  // Advance 2 pixels.
1522  }
1523  if (width & 1) {
1524    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1525    b0 = b0 >> 3;
1526    g0 = g0 >> 2;
1527    r0 = r0 >> 3;
1528    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1529  }
1530}
1531
1532void I411ToARGBRow_C(const uint8* src_y,
1533                     const uint8* src_u,
1534                     const uint8* src_v,
1535                     uint8* rgb_buf,
1536                     const struct YuvConstants* yuvconstants,
1537                     int width) {
1538  int x;
1539  for (x = 0; x < width - 3; x += 4) {
1540    YuvPixel(src_y[0], src_u[0], src_v[0],
1541             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1542    rgb_buf[3] = 255;
1543    YuvPixel(src_y[1], src_u[0], src_v[0],
1544             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1545    rgb_buf[7] = 255;
1546    YuvPixel(src_y[2], src_u[0], src_v[0],
1547             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
1548    rgb_buf[11] = 255;
1549    YuvPixel(src_y[3], src_u[0], src_v[0],
1550             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
1551    rgb_buf[15] = 255;
1552    src_y += 4;
1553    src_u += 1;
1554    src_v += 1;
1555    rgb_buf += 16;  // Advance 4 pixels.
1556  }
1557  if (width & 2) {
1558    YuvPixel(src_y[0], src_u[0], src_v[0],
1559             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1560    rgb_buf[3] = 255;
1561    YuvPixel(src_y[1], src_u[0], src_v[0],
1562             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1563    rgb_buf[7] = 255;
1564    src_y += 2;
1565    rgb_buf += 8;  // Advance 2 pixels.
1566  }
1567  if (width & 1) {
1568    YuvPixel(src_y[0], src_u[0], src_v[0],
1569             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1570    rgb_buf[3] = 255;
1571  }
1572}
1573
1574void NV12ToARGBRow_C(const uint8* src_y,
1575                     const uint8* src_uv,
1576                     uint8* rgb_buf,
1577                     const struct YuvConstants* yuvconstants,
1578                     int width) {
1579  int x;
1580  for (x = 0; x < width - 1; x += 2) {
1581    YuvPixel(src_y[0], src_uv[0], src_uv[1],
1582             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1583    rgb_buf[3] = 255;
1584    YuvPixel(src_y[1], src_uv[0], src_uv[1],
1585             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1586    rgb_buf[7] = 255;
1587    src_y += 2;
1588    src_uv += 2;
1589    rgb_buf += 8;  // Advance 2 pixels.
1590  }
1591  if (width & 1) {
1592    YuvPixel(src_y[0], src_uv[0], src_uv[1],
1593             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1594    rgb_buf[3] = 255;
1595  }
1596}
1597
1598void NV21ToARGBRow_C(const uint8* src_y,
1599                     const uint8* src_vu,
1600                     uint8* rgb_buf,
1601                     const struct YuvConstants* yuvconstants,
1602                     int width) {
1603  int x;
1604  for (x = 0; x < width - 1; x += 2) {
1605    YuvPixel(src_y[0], src_vu[1], src_vu[0],
1606             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1607    rgb_buf[3] = 255;
1608    YuvPixel(src_y[1], src_vu[1], src_vu[0],
1609             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1610    rgb_buf[7] = 255;
1611    src_y += 2;
1612    src_vu += 2;
1613    rgb_buf += 8;  // Advance 2 pixels.
1614  }
1615  if (width & 1) {
1616    YuvPixel(src_y[0], src_vu[1], src_vu[0],
1617             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1618    rgb_buf[3] = 255;
1619  }
1620}
1621
1622void NV12ToRGB565Row_C(const uint8* src_y,
1623                       const uint8* src_uv,
1624                       uint8* dst_rgb565,
1625                       const struct YuvConstants* yuvconstants,
1626                       int width) {
1627  uint8 b0;
1628  uint8 g0;
1629  uint8 r0;
1630  uint8 b1;
1631  uint8 g1;
1632  uint8 r1;
1633  int x;
1634  for (x = 0; x < width - 1; x += 2) {
1635    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
1636    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
1637    b0 = b0 >> 3;
1638    g0 = g0 >> 2;
1639    r0 = r0 >> 3;
1640    b1 = b1 >> 3;
1641    g1 = g1 >> 2;
1642    r1 = r1 >> 3;
1643    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1644        (b1 << 16) | (g1 << 21) | (r1 << 27);
1645    src_y += 2;
1646    src_uv += 2;
1647    dst_rgb565 += 4;  // Advance 2 pixels.
1648  }
1649  if (width & 1) {
1650    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
1651    b0 = b0 >> 3;
1652    g0 = g0 >> 2;
1653    r0 = r0 >> 3;
1654    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1655  }
1656}
1657
1658void YUY2ToARGBRow_C(const uint8* src_yuy2,
1659                     uint8* rgb_buf,
1660                     const struct YuvConstants* yuvconstants,
1661                     int width) {
1662  int x;
1663  for (x = 0; x < width - 1; x += 2) {
1664    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1665             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1666    rgb_buf[3] = 255;
1667    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
1668             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1669    rgb_buf[7] = 255;
1670    src_yuy2 += 4;
1671    rgb_buf += 8;  // Advance 2 pixels.
1672  }
1673  if (width & 1) {
1674    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1675             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1676    rgb_buf[3] = 255;
1677  }
1678}
1679
1680void UYVYToARGBRow_C(const uint8* src_uyvy,
1681                     uint8* rgb_buf,
1682                     const struct YuvConstants* yuvconstants,
1683                     int width) {
1684  int x;
1685  for (x = 0; x < width - 1; x += 2) {
1686    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1687             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1688    rgb_buf[3] = 255;
1689    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
1690             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
1691    rgb_buf[7] = 255;
1692    src_uyvy += 4;
1693    rgb_buf += 8;  // Advance 2 pixels.
1694  }
1695  if (width & 1) {
1696    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1697             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
1698    rgb_buf[3] = 255;
1699  }
1700}
1701
// Convert a row of planar Y, U and V samples into 4-byte pixels in |rgb_buf|.
// One U and one V sample are shared by two consecutive Y samples.
// YuvPixel() fills bytes 1..3 of each output pixel and byte 0 is set to 255.
// An odd |width| converts one trailing pixel using the next U/V sample.
void I422ToRGBARow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs;
  // width / 2 full pairs, each consuming 2 Y, 1 U and 1 V.
  for (pairs = width / 2; pairs > 0; --pairs) {
    uint8* const px0 = rgb_buf;
    uint8* const px1 = rgb_buf + 4;
    YuvPixel(src_y[0], *src_u, *src_v, px0 + 1, px0 + 2, px0 + 3,
             yuvconstants);
    px0[0] = 255;
    YuvPixel(src_y[1], *src_u, *src_v, px1 + 1, px1 + 2, px1 + 3,
             yuvconstants);
    px1[0] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Trailing pixel of an odd-width row.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v,
             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
    rgb_buf[0] = 255;
  }
}
1727
// Convert a row of single-channel samples into 4-byte pixels in |rgb_buf|.
// YPixel() fills the first three bytes of each output pixel from one source
// byte; the fourth byte is set to 255.
void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8* const px = rgb_buf + i * 4;
    YPixel(src_y[i], px + 0, px + 1, px + 2);
    px[3] = 255;
  }
}
1743
// Reverse a row of |width| bytes: dst[i] receives src[width - 1 - i].
void MirrorRow_C(const uint8* src, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}
1756
// Reverse a row of |width| interleaved byte pairs and de-interleave it:
// the first byte of each pair goes to |dst_u|, the second to |dst_v|.
void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // Pair that is mirrored onto output position i.
    const uint8* const pair = src_uv + (width - 1 - i) * 2;
    dst_u[i] = pair[0];
    dst_v[i] = pair[1];
  }
}
1772
// Reverse a row of |width| 4-byte pixels; byte order inside each pixel is
// preserved.
// Pixels are moved with memcpy instead of through uint32* casts so the row
// pointers need no 4-byte alignment and no uint8 storage is accessed through
// an incompatible lvalue type.
void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 pixel;
    memcpy(&pixel, src + (width - 1 - x) * 4, sizeof(pixel));
    memcpy(dst + x * 4, &pixel, sizeof(pixel));
  }
}
1787
// De-interleave |width| byte pairs: the first byte of each pair is stored to
// |dst_u|, the second to |dst_v|.
void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i + 0];
    dst_v[i] = src_uv[2 * i + 1];
  }
}
1802
// Interleave |width| bytes from |src_u| and |src_v| into |dst_uv| as
// U0 V0 U1 V1 ...
void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8* const pair = dst_uv + 2 * i;
    pair[0] = src_u[i];
    pair[1] = src_v[i];
  }
}
1818
// Copy |count| bytes from |src| to |dst| with memcpy.
void CopyRow_C(const uint8* src, uint8* dst, int count) {
  const size_t byte_count = (size_t)count;
  memcpy(dst, src, byte_count);
}
1822
// Copy |count| 16-bit samples (count * 2 bytes) from |src| to |dst| with
// memcpy.
void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
  const int byte_count = count * 2;
  memcpy(dst, src, (size_t)byte_count);
}
1826
// Fill |width| bytes of |dst| with the value |v8| using memset.
void SetRow_C(uint8* dst, uint8 v8, int width) {
  const size_t byte_count = (size_t)width;
  memset(dst, v8, byte_count);
}
1830
// Fill |width| 4-byte pixels of |dst_argb| with |v32|, stored in the host's
// native byte order.
// memcpy is used instead of a uint32* cast so |dst_argb| needs no 4-byte
// alignment and the uint8 buffer is not written through an incompatible
// lvalue type.
void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    memcpy(dst_argb + x * 4, &v32, sizeof(v32));
  }
}
1838
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
// For every 4-byte group, byte 1 of the two rows is averaged (rounding up)
// into |dst_u| and byte 3 into |dst_v|. The second row starts
// |src_stride_yuy2| bytes after the first. One U/V pair is produced per two
// pixels, including the partial group of an odd |width|.
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
                   uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row0 = src_yuy2;
  const uint8* row1 = src_yuy2 + src_stride_yuy2;
  int x;
  for (x = 0; x < width; x += 2) {
    *dst_u++ = (row0[1] + row1[1] + 1) >> 1;
    *dst_v++ = (row0[3] + row1[3] + 1) >> 1;
    row0 += 4;
    row1 += 4;
  }
}
1852
// Copy row of YUY2 UV's (422) into U and V (422).
// Byte 1 of every 4-byte group goes to |dst_u| and byte 3 to |dst_v|; one
// pair is produced per two pixels, including the partial group of an odd
// |width|.
void YUY2ToUV422Row_C(const uint8* src_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  const uint8* group = src_yuy2;
  int x;
  for (x = 0; x < width; x += 2) {
    *dst_u++ = group[1];
    *dst_v++ = group[3];
    group += 4;
  }
}
1866
// Copy row of YUY2 Y's (422) into Y (420/422).
// The Y samples are the even bytes of the packed row.
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_y[i] = src_yuy2[2 * i];
  }
}
1880
// Filter 2 rows of UYVY UV's (422) into U and V (420).
// For every 4-byte group, byte 0 of the two rows is averaged (rounding up)
// into |dst_u| and byte 2 into |dst_v|. The second row starts
// |src_stride_uyvy| bytes after the first. One U/V pair is produced per two
// pixels, including the partial group of an odd |width|.
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
                   uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row0 = src_uyvy;
  const uint8* row1 = src_uyvy + src_stride_uyvy;
  int x;
  for (x = 0; x < width; x += 2) {
    *dst_u++ = (row0[0] + row1[0] + 1) >> 1;
    *dst_v++ = (row0[2] + row1[2] + 1) >> 1;
    row0 += 4;
    row1 += 4;
  }
}
1894
// Copy row of UYVY UV's (422) into U and V (422).
// Byte 0 of every 4-byte group goes to |dst_u| and byte 2 to |dst_v|; one
// pair is produced per two pixels, including the partial group of an odd
// |width|.
void UYVYToUV422Row_C(const uint8* src_uyvy,
                      uint8* dst_u, uint8* dst_v, int width) {
  const uint8* group = src_uyvy;
  int x;
  for (x = 0; x < width; x += 2) {
    *dst_u++ = group[0];
    *dst_v++ = group[2];
    group += 4;
  }
}
1908
// Copy row of UYVY Y's (422) into Y (420/422).
// The Y samples are the odd bytes of the packed row.
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_y[i] = src_uyvy[2 * i + 1];
  }
}
1922
// Blend one channel: foreground |f| plus background |b| scaled by
// (256 - alpha) / 256. The sum can exceed 255 when the foreground is not
// premultiplied by its alpha, so it is saturated with clamp255() instead of
// being left to wrap when stored into a uint8.
#define BLEND(f, b, a) clamp255((((256 - (a)) * (b)) >> 8) + (f))

// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
// The first three bytes of each pixel are blended using byte 3 of the
// src_argb0 pixel as alpha; byte 3 of the output is always 255.
// This code mimics the SSSE3 version for better testability.
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
                    uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // Load the whole pixel before storing so in-place blending works.
    const uint32 fb = src_argb0[0];
    const uint32 fg = src_argb0[1];
    const uint32 fr = src_argb0[2];
    const uint32 a = src_argb0[3];
    const uint32 bb = src_argb1[0];
    const uint32 bg = src_argb1[1];
    const uint32 br = src_argb1[2];
    dst_argb[0] = BLEND(fb, bb, a);
    dst_argb[1] = BLEND(fg, bg, a);
    dst_argb[2] = BLEND(fr, br, a);
    dst_argb[3] = 255u;
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#undef BLEND
1975
// Blend two single-channel rows with a per-sample alpha row:
//   dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8
void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
                     const uint8* alpha, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int a = alpha[x];
    const int fg = src0[x];
    const int bg = src1[x];
    dst[x] = (uint8)((a * fg + (255 - a) * bg + 255) >> 8);
  }
}
1993
// Multiply source RGB by alpha and store to destination.
// Each 8-bit value v is widened to 16 bits as v | (v << 8); the product of
// the widened channel and widened alpha is shifted right by 24 to get the
// 8-bit result. Byte 3 (alpha) is copied unchanged.
// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // Load the whole pixel before storing anything.
    const uint32 b = src_argb[0];
    const uint32 g = src_argb[1];
    const uint32 r = src_argb[2];
    const uint32 a = src_argb[3];
    const uint32 a16 = a | (a << 8);
    dst_argb[0] = (uint8)((a16 * (b | (b << 8))) >> 24);
    dst_argb[1] = (uint8)((a16 * (g | (g << 8))) >> 24);
    dst_argb[2] = (uint8)((a16 * (r | (r << 8))) >> 24);
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}
2033
// Reciprocal table for dividing source RGB by alpha, i.e. approximating
//   b = (b * 255 + (a / 2)) / a;
//   g = (g * 255 + (a / 2)) / a;
//   r = (r * 255 + (a / 2)) / a;
// The reciprocal method is off by 1 on some values, e.g. 125.
// Entries are 8.8 fixed point with 1.0 in the upper short and 1 / a in the
// lower short. Entries 0, 1 and 255 are written out by hand (lower shorts
// 0x0000, 0xffff and 0x0100); the rest come from T().
#define T(a) (0x01000000 + (0x10000 / (a)))
// Eight consecutive entries starting at alpha |a|.
#define T8(a) \
  T((a) + 0), T((a) + 1), T((a) + 2), T((a) + 3), \
  T((a) + 4), T((a) + 5), T((a) + 6), T((a) + 7)
const uint32 fixed_invtbl8[256] = {
  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
  T8(0x08), T8(0x10), T8(0x18), T8(0x20), T8(0x28), T8(0x30),
  T8(0x38), T8(0x40), T8(0x48), T8(0x50), T8(0x58), T8(0x60),
  T8(0x68), T8(0x70), T8(0x78), T8(0x80), T8(0x88), T8(0x90),
  T8(0x98), T8(0xa0), T8(0xa8), T8(0xb0), T8(0xb8), T8(0xc0),
  T8(0xc8), T8(0xd0), T8(0xd8), T8(0xe0), T8(0xe8), T8(0xf0),
  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
#undef T8
#undef T
2075
2076void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
2077  int i;
2078  for (i = 0; i < width; ++i) {
2079    uint32 b = src_argb[0];
2080    uint32 g = src_argb[1];
2081    uint32 r = src_argb[2];
2082    const uint32 a = src_argb[3];
2083    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
2084    b = (b * ia) >> 8;
2085    g = (g * ia) >> 8;
2086    r = (r * ia) >> 8;
2087    // Clamping should not be necessary but is free in assembly.
2088    dst_argb[0] = clamp255(b);
2089    dst_argb[1] = clamp255(g);
2090    dst_argb[2] = clamp255(r);
2091    dst_argb[3] = a;
2092    src_argb += 4;
2093    dst_argb += 4;
2094  }
2095}
2096
// For each of the 4 channels, keep a running sum of |row| from the start of
// the row and store that sum plus the matching entry of |previous_cumsum|
// into |cumsum|. |width| is in 4-channel pixels.
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
                               const int32* previous_cumsum, int width) {
  int32 running[4] = {0, 0, 0, 0};
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      const int i = x * 4 + c;
      running[c] += row[i];
      cumsum[i] = running[c] + previous_cumsum[i];
    }
  }
}
2112
// For |count| output pixels, form the 4-channel sum
//   bl[w + c] + tl[c] - bl[c] - tl[w + c]
// from the cumulative-sum rows |tl| and |bl|, scale it by 1 / |area| and
// store the result truncated to uint8. |w| is the offset, in int32 entries,
// between the two columns that are differenced.
void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
                                int w, int area, uint8* dst, int count) {
  const float inv_area = 1.0f / area;
  int i;
  for (i = 0; i < count; ++i) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst[c] = (uint8)((bl[w + c] + tl[c] - bl[c] - tl[w + c]) * inv_area);
    }
    dst += 4;
    tl += 4;
    bl += 4;
  }
}
2127
// Copy pixels from rotated source to destination row with a slope.
// uv_dudv[0..1] is the starting source (x, y); uv_dudv[2..3] is added to it
// after every output pixel. Coordinates are truncated to int and the 4-byte
// pixel at src_argb + y * src_argb_stride + x * 4 is copied.
// Pixels are moved with memcpy rather than uint32* casts so neither buffer
// has to be 4-byte aligned and no uint8 storage is accessed through an
// incompatible lvalue type.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                     uint8* dst_argb, const float* uv_dudv, int width) {
  int i;
  // Current source position.
  float uv[2];
  uv[0] = uv_dudv[0];
  uv[1] = uv_dudv[1];
  for (i = 0; i < width; ++i) {
    const int x = (int)(uv[0]);
    const int y = (int)(uv[1]);
    uint32 pixel;
    memcpy(&pixel, src_argb + y * src_argb_stride + x * 4, sizeof(pixel));
    memcpy(dst_argb, &pixel, sizeof(pixel));
    dst_argb += 4;
    uv[0] += uv_dudv[2];
    uv[1] += uv_dudv[3];
  }
}
2148
// Blend 2 rows into 1: each output byte is the rounded-up average of the
// source byte and the byte |src_uv_stride| further on.
static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
                      uint8* dst_uv, int width) {
  const uint8* src_uv1 = src_uv + src_uv_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x] = (src_uv[x] + src_uv1[x] + 1) >> 1;
  }
}

// 16-bit variant of HalfRow_C. |src_uv_stride| is in uint16 elements.
static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
                         uint16* dst_uv, int width) {
  const uint16* src_uv1 = src_uv + src_uv_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x] = (src_uv[x] + src_uv1[x] + 1) >> 1;
  }
}

// C version 2x2 -> 2x1.
// Blends the row at |src_ptr| with the row |src_stride| bytes further on.
// |source_y_fraction| is the weight of the second row out of 256; the first
// row gets 256 - source_y_fraction. A fraction of 0 copies the first row and
// 128 takes the HalfRow_C average; all other fractions round to nearest by
// adding 128 before the shift.
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
                      ptrdiff_t src_stride,
                      int width, int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  if (y1_fraction == 0) {
    memcpy(dst_ptr, src_ptr, width);
    return;
  }
  if (y1_fraction == 128) {
    HalfRow_C(src_ptr, src_stride, dst_ptr, width);
    return;
  }
  for (x = 0; x < width; ++x) {
    dst_ptr[x] =
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8;
  }
}

// 16-bit variant of InterpolateRow_C. |src_stride| is in uint16 elements.
// The general path adds the same +128 rounding term as the 8-bit version so
// results round to nearest, consistent with the rounded fraction == 128 path
// instead of truncating.
void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
                         ptrdiff_t src_stride,
                         int width, int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint16* src_ptr1 = src_ptr + src_stride;
  int x;
  if (source_y_fraction == 0) {
    memcpy(dst_ptr, src_ptr, width * 2);
    return;
  }
  if (source_y_fraction == 128) {
    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
    return;
  }
  for (x = 0; x < width; ++x) {
    dst_ptr[x] =
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8;
  }
}
2223
// Use first 4 shuffler values to reorder ARGB channels.
// Output byte k of every pixel is source byte shuffler[k] of the same pixel.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
                      const uint8* shuffler, int width) {
  // Read the selectors once, before any output is written.
  const int sel0 = shuffler[0];
  const int sel1 = shuffler[1];
  const int sel2 = shuffler[2];
  const int sel3 = shuffler[3];
  int x;
  for (x = 0; x < width; ++x) {
    // Gather all four bytes first to support in-place conversion.
    const uint8 c0 = src_argb[sel0];
    const uint8 c1 = src_argb[sel1];
    const uint8 c2 = src_argb[sel2];
    const uint8 c3 = src_argb[sel3];
    dst_argb[0] = c0;
    dst_argb[1] = c1;
    dst_argb[2] = c2;
    dst_argb[3] = c3;
    src_argb += 4;
    dst_argb += 4;
  }
}
2247
// Pack planar Y, U and V into 4-byte groups laid out Y0 U Y1 V, with one U/V
// sample per two Y samples. For an odd |width| the last group is written
// with a zero in the Y1 slot.
void I422ToYUY2Row_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_frame, int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = *src_u++;
    dst_frame[2] = src_y[1];
    dst_frame[3] = *src_v++;
    src_y += 2;
    dst_frame += 4;
  }
  if (width & 1) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = 0;  // No second luma sample.
    dst_frame[3] = src_v[0];
  }
}
2270
// Pack planar Y, U and V into 4-byte groups laid out U Y0 V Y1, with one U/V
// sample per two Y samples. For an odd |width| the last group is written
// with a zero in the Y1 slot.
void I422ToUYVYRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_frame, int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    dst_frame[0] = *src_u++;
    dst_frame[1] = src_y[0];
    dst_frame[2] = *src_v++;
    dst_frame[3] = src_y[1];
    src_y += 2;
    dst_frame += 4;
  }
  if (width & 1) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = 0;  // No second luma sample.
  }
}
2293
2294
// Apply a cubic polynomial to every channel of every pixel.
// |poly| holds 16 floats: for channel c (byte c of the pixel) the value v is
// mapped to
//   poly[c] + poly[c + 4] * v + poly[c + 8] * v^2 + poly[c + 12] * v^3
// which is converted to int32 and passed through Clamp() before storing.
void ARGBPolynomialRow_C(const uint8* src_argb,
                         uint8* dst_argb,
                         const float* poly,
                         int width) {
  int i;
  for (i = 0; i < width; ++i) {
    float acc[4];
    int c;
    // Evaluate all four channels before storing any of them.
    for (c = 0; c < 4; ++c) {
      const float v = (float)(src_argb[c]);
      const float v2 = v * v;
      const float v3 = v2 * v;
      acc[c] = poly[c] + poly[c + 4] * v;
      acc[c] += poly[c + 8] * v2;
      acc[c] += poly[c + 12] * v3;
    }
    dst_argb[0] = Clamp((int32)(acc[0]));
    dst_argb[1] = Clamp((int32)(acc[1]));
    dst_argb[2] = Clamp((int32)(acc[2]));
    dst_argb[3] = Clamp((int32)(acc[3]));
    src_argb += 4;
    dst_argb += 4;
  }
}
2334
// Remap the first three bytes of every pixel through a table chosen by a
// weighted sum of those bytes.
// The three low bytes of |lumacoeff| are the weights for bytes 0, 1 and 2.
// The weighted sum masked with 0x7F00 is the byte offset of a 256-entry
// table inside |luma|: luminance in rows, color values in columns.
// Byte 3 is copied unchanged.
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                             const uint8* luma, uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  int i;
  for (i = 0; i < width; ++i) {
    // Pick the table row before any byte of this pixel is overwritten.
    const uint32 weighted =
        src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc;
    const uint8* const table = (weighted & 0x7F00u) + luma;
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
2370
// Copy byte 3 of each 4-byte source pixel into byte 3 of the matching
// destination pixel; the other destination bytes are left untouched.
void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i * 4 + 3] = src[i * 4 + 3];
  }
}
2383
// Gather byte 3 of each 4-byte source pixel into the packed row |dst_a|.
void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_a[i] = src_argb[i * 4 + 3];
  }
}
2396
// Scatter a packed row of bytes into byte 3 of each 4-byte destination
// pixel; the other destination bytes are left untouched.
void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i * 4 + 3] = src[i];
  }
}
2409
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048

#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
    defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
// Step 1 converts up to MAXTWIDTH pixels to ARGB in a stack buffer; step 2
// packs that buffer to RGB565 (2 bytes per pixel). U and V advance by half
// the number of pixels processed.
void I422ToRGB565Row_SSSE3(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2435
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer, then pack that buffer to ARGB1555 (2 bytes per pixel). U and V
// advance by half the number of pixels processed.
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2457
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer, then pack that buffer to ARGB4444 (2 bytes per pixel). U and V
// advance by half the number of pixels processed.
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2479
#if defined(HAS_NV12TORGB565ROW_SSSE3)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer, then pack that buffer to RGB565 (2 bytes per pixel). |src_uv|
// advances by one byte per pixel processed.
void NV12ToRGB565Row_SSSE3(const uint8* src_y,
                           const uint8* src_uv,
                           uint8* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    NV12ToARGBRow_SSSE3(src_y, src_uv, argb_tmp, yuvconstants, chunk);
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_uv += chunk;
    dst_rgb565 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2499
#if defined(HAS_I422TORGB565ROW_AVX2)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer, then pack that buffer to RGB565 (2 bytes per pixel) with the AVX2
// packer when it is available, otherwise the SSE2 one. U and V advance by
// half the number of pixels processed.
void I422ToRGB565Row_AVX2(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
#else
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2524
#if defined(HAS_I422TOARGB1555ROW_AVX2)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer, then pack that buffer to ARGB1555 (2 bytes per pixel) with the
// AVX2 packer when it is available, otherwise the SSE2 one. U and V advance
// by half the number of pixels processed.
void I422ToARGB1555Row_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
    ARGBToARGB1555Row_AVX2(argb_tmp, dst_argb1555, chunk);
#else
    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2550
#if defined(HAS_I422TOARGB4444ROW_AVX2)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer, then pack that buffer to ARGB4444 (2 bytes per pixel) with the
// AVX2 packer when it is available, otherwise the SSE2 one. U and V advance
// by half the number of pixels processed.
void I422ToARGB4444Row_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
    ARGBToARGB4444Row_AVX2(argb_tmp, dst_argb4444, chunk);
#else
    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2576
#if defined(HAS_I422TORGB24ROW_AVX2)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer with the AVX2 converter, then pack that buffer to RGB24 (3 bytes
// per pixel) with the SSSE3 packer. U and V advance by half the number of
// pixels processed.
void I422ToRGB24Row_AVX2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
    // TODO(fbarchard): ARGBToRGB24Row_AVX2
    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_rgb24 += chunk * 3;
    remaining -= chunk;
  }
}
#endif
2599
#if defined(HAS_NV12TORGB565ROW_AVX2)
// Two step wrapper: convert up to MAXTWIDTH pixels to ARGB in a stack
// buffer, then pack that buffer to RGB565 (2 bytes per pixel) with the AVX2
// packer when it is available, otherwise the SSE2 one. |src_uv| advances by
// one byte per pixel processed.
void NV12ToRGB565Row_AVX2(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
  int remaining;
  for (remaining = width; remaining > 0;) {
    const int chunk = (remaining > MAXTWIDTH) ? MAXTWIDTH : remaining;
    NV12ToARGBRow_AVX2(src_y, src_uv, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
#else
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
#endif
    src_y += chunk;
    src_uv += chunk;
    dst_rgb565 += chunk * 2;
    remaining -= chunk;
  }
}
#endif
2623
2624#ifdef __cplusplus
2625}  // extern "C"
2626}  // namespace libyuv
2627#endif
2628