1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#include <string.h>  // For memcpy and memset.
14
15#include "libyuv/basic_types.h"
16
17#ifdef __cplusplus
18namespace libyuv {
19extern "C" {
20#endif
21
22// llvm x86 is poor at ternary operator, so use branchless min/max.
23
24#define USE_BRANCHLESS 1
25#if USE_BRANCHLESS
26static __inline int32 clamp0(int32 v) {
27  return ((-(v) >> 31) & (v));
28}
29
30static __inline int32 clamp255(int32 v) {
31  return (((255 - (v)) >> 31) | (v)) & 255;
32}
33
34static __inline uint32 Clamp(int32 val) {
35  int v = clamp0(val);
36  return (uint32)(clamp255(v));
37}
38
39static __inline uint32 Abs(int32 v) {
40  int m = v >> 31;
41  return (v + m) ^ m;
42}
43#else   // USE_BRANCHLESS
44static __inline int32 clamp0(int32 v) {
45  return (v < 0) ? 0 : v;
46}
47
48static __inline int32 clamp255(int32 v) {
49  return (v > 255) ? 255 : v;
50}
51
52static __inline uint32 Clamp(int32 val) {
53  int v = clamp0(val);
54  return (uint32)(clamp255(v));
55}
56
57static __inline uint32 Abs(int32 v) {
58  return (v < 0) ? -v : v;
59}
60#endif  // USE_BRANCHLESS
61
62#ifdef LIBYUV_LITTLE_ENDIAN
63#define WRITEWORD(p, v) *(uint32*)(p) = v
64#else
65static inline void WRITEWORD(uint8* p, uint32 v) {
66  p[0] = (uint8)(v & 255);
67  p[1] = (uint8)((v >> 8) & 255);
68  p[2] = (uint8)((v >> 16) & 255);
69  p[3] = (uint8)((v >> 24) & 255);
70}
71#endif
72
73void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
74  int x;
75  for (x = 0; x < width; ++x) {
76    uint8 b = src_rgb24[0];
77    uint8 g = src_rgb24[1];
78    uint8 r = src_rgb24[2];
79    dst_argb[0] = b;
80    dst_argb[1] = g;
81    dst_argb[2] = r;
82    dst_argb[3] = 255u;
83    dst_argb += 4;
84    src_rgb24 += 3;
85  }
86}
87
88void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
89  int x;
90  for (x = 0; x < width; ++x) {
91    uint8 r = src_raw[0];
92    uint8 g = src_raw[1];
93    uint8 b = src_raw[2];
94    dst_argb[0] = b;
95    dst_argb[1] = g;
96    dst_argb[2] = r;
97    dst_argb[3] = 255u;
98    dst_argb += 4;
99    src_raw += 3;
100  }
101}
102
103void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
104  int x;
105  for (x = 0; x < width; ++x) {
106    uint8 r = src_raw[0];
107    uint8 g = src_raw[1];
108    uint8 b = src_raw[2];
109    dst_rgb24[0] = b;
110    dst_rgb24[1] = g;
111    dst_rgb24[2] = r;
112    dst_rgb24 += 3;
113    src_raw += 3;
114  }
115}
116
117void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
118  int x;
119  for (x = 0; x < width; ++x) {
120    uint8 b = src_rgb565[0] & 0x1f;
121    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
122    uint8 r = src_rgb565[1] >> 3;
123    dst_argb[0] = (b << 3) | (b >> 2);
124    dst_argb[1] = (g << 2) | (g >> 4);
125    dst_argb[2] = (r << 3) | (r >> 2);
126    dst_argb[3] = 255u;
127    dst_argb += 4;
128    src_rgb565 += 2;
129  }
130}
131
132void ARGB1555ToARGBRow_C(const uint8* src_argb1555,
133                         uint8* dst_argb,
134                         int width) {
135  int x;
136  for (x = 0; x < width; ++x) {
137    uint8 b = src_argb1555[0] & 0x1f;
138    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
139    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
140    uint8 a = src_argb1555[1] >> 7;
141    dst_argb[0] = (b << 3) | (b >> 2);
142    dst_argb[1] = (g << 3) | (g >> 2);
143    dst_argb[2] = (r << 3) | (r >> 2);
144    dst_argb[3] = -a;
145    dst_argb += 4;
146    src_argb1555 += 2;
147  }
148}
149
150void ARGB4444ToARGBRow_C(const uint8* src_argb4444,
151                         uint8* dst_argb,
152                         int width) {
153  int x;
154  for (x = 0; x < width; ++x) {
155    uint8 b = src_argb4444[0] & 0x0f;
156    uint8 g = src_argb4444[0] >> 4;
157    uint8 r = src_argb4444[1] & 0x0f;
158    uint8 a = src_argb4444[1] >> 4;
159    dst_argb[0] = (b << 4) | b;
160    dst_argb[1] = (g << 4) | g;
161    dst_argb[2] = (r << 4) | r;
162    dst_argb[3] = (a << 4) | a;
163    dst_argb += 4;
164    src_argb4444 += 2;
165  }
166}
167
168void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
169  int x;
170  for (x = 0; x < width; ++x) {
171    uint8 b = src_argb[0];
172    uint8 g = src_argb[1];
173    uint8 r = src_argb[2];
174    dst_rgb[0] = b;
175    dst_rgb[1] = g;
176    dst_rgb[2] = r;
177    dst_rgb += 3;
178    src_argb += 4;
179  }
180}
181
182void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
183  int x;
184  for (x = 0; x < width; ++x) {
185    uint8 b = src_argb[0];
186    uint8 g = src_argb[1];
187    uint8 r = src_argb[2];
188    dst_rgb[0] = r;
189    dst_rgb[1] = g;
190    dst_rgb[2] = b;
191    dst_rgb += 3;
192    src_argb += 4;
193  }
194}
195
196void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
197  int x;
198  for (x = 0; x < width - 1; x += 2) {
199    uint8 b0 = src_argb[0] >> 3;
200    uint8 g0 = src_argb[1] >> 2;
201    uint8 r0 = src_argb[2] >> 3;
202    uint8 b1 = src_argb[4] >> 3;
203    uint8 g1 = src_argb[5] >> 2;
204    uint8 r1 = src_argb[6] >> 3;
205    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
206                           (r1 << 27));
207    dst_rgb += 4;
208    src_argb += 8;
209  }
210  if (width & 1) {
211    uint8 b0 = src_argb[0] >> 3;
212    uint8 g0 = src_argb[1] >> 2;
213    uint8 r0 = src_argb[2] >> 3;
214    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
215  }
216}
217
// dither4 is a row of 4 values from a 4x4 dither matrix.
// The 4x4 matrix contains values that are added to RGB.  When converting to
// fewer bits (565) this provides an ordered dither.
// The first byte of the 4x4 matrix is the upper-left entry.
// The 4 values are passed as an int and then referenced as a byte array, so
// endianness does not affect the order of the original matrix.  However,
// dither4 will contain the first pixel's value in the lower byte on little
// endian or in the upper byte on big endian.
226void ARGBToRGB565DitherRow_C(const uint8* src_argb,
227                             uint8* dst_rgb,
228                             const uint32 dither4,
229                             int width) {
230  int x;
231  for (x = 0; x < width - 1; x += 2) {
232    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
233    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
234    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
235    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
236    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
237    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
238    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
239    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
240    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
241                           (r1 << 27));
242    dst_rgb += 4;
243    src_argb += 8;
244  }
245  if (width & 1) {
246    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
247    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
248    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
249    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
250    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
251  }
252}
253
254void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
255  int x;
256  for (x = 0; x < width - 1; x += 2) {
257    uint8 b0 = src_argb[0] >> 3;
258    uint8 g0 = src_argb[1] >> 3;
259    uint8 r0 = src_argb[2] >> 3;
260    uint8 a0 = src_argb[3] >> 7;
261    uint8 b1 = src_argb[4] >> 3;
262    uint8 g1 = src_argb[5] >> 3;
263    uint8 r1 = src_argb[6] >> 3;
264    uint8 a1 = src_argb[7] >> 7;
265    *(uint32*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
266                          (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
267    dst_rgb += 4;
268    src_argb += 8;
269  }
270  if (width & 1) {
271    uint8 b0 = src_argb[0] >> 3;
272    uint8 g0 = src_argb[1] >> 3;
273    uint8 r0 = src_argb[2] >> 3;
274    uint8 a0 = src_argb[3] >> 7;
275    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
276  }
277}
278
279void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
280  int x;
281  for (x = 0; x < width - 1; x += 2) {
282    uint8 b0 = src_argb[0] >> 4;
283    uint8 g0 = src_argb[1] >> 4;
284    uint8 r0 = src_argb[2] >> 4;
285    uint8 a0 = src_argb[3] >> 4;
286    uint8 b1 = src_argb[4] >> 4;
287    uint8 g1 = src_argb[5] >> 4;
288    uint8 r1 = src_argb[6] >> 4;
289    uint8 a1 = src_argb[7] >> 4;
290    *(uint32*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) |
291                          (g1 << 20) | (r1 << 24) | (a1 << 28);
292    dst_rgb += 4;
293    src_argb += 8;
294  }
295  if (width & 1) {
296    uint8 b0 = src_argb[0] >> 4;
297    uint8 g0 = src_argb[1] >> 4;
298    uint8 r0 = src_argb[2] >> 4;
299    uint8 a0 = src_argb[3] >> 4;
300    *(uint16*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
301  }
302}
303
304static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
305  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
306}
307
308static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
309  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
310}
311static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
312  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
313}
314
315// ARGBToY_C and ARGBToUV_C
316#define MAKEROWY(NAME, R, G, B, BPP)                                     \
317  void NAME##ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
318    int x;                                                               \
319    for (x = 0; x < width; ++x) {                                        \
320      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);       \
321      src_argb0 += BPP;                                                  \
322      dst_y += 1;                                                        \
323    }                                                                    \
324  }                                                                      \
325  void NAME##ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,        \
326                       uint8* dst_u, uint8* dst_v, int width) {          \
327    const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                   \
328    int x;                                                               \
329    for (x = 0; x < width - 1; x += 2) {                                 \
330      uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] +        \
331                  src_rgb1[B + BPP]) >>                                  \
332                 2;                                                      \
333      uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] +        \
334                  src_rgb1[G + BPP]) >>                                  \
335                 2;                                                      \
336      uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] +        \
337                  src_rgb1[R + BPP]) >>                                  \
338                 2;                                                      \
339      dst_u[0] = RGBToU(ar, ag, ab);                                     \
340      dst_v[0] = RGBToV(ar, ag, ab);                                     \
341      src_rgb0 += BPP * 2;                                               \
342      src_rgb1 += BPP * 2;                                               \
343      dst_u += 1;                                                        \
344      dst_v += 1;                                                        \
345    }                                                                    \
346    if (width & 1) {                                                     \
347      uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                       \
348      uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                       \
349      uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                       \
350      dst_u[0] = RGBToU(ar, ag, ab);                                     \
351      dst_v[0] = RGBToV(ar, ag, ab);                                     \
352    }                                                                    \
353  }
354
355MAKEROWY(ARGB, 2, 1, 0, 4)
356MAKEROWY(BGRA, 1, 2, 3, 4)
357MAKEROWY(ABGR, 0, 1, 2, 4)
358MAKEROWY(RGBA, 3, 2, 1, 4)
359MAKEROWY(RGB24, 2, 1, 0, 3)
360MAKEROWY(RAW, 0, 1, 2, 3)
361#undef MAKEROWY
362
363// JPeg uses a variation on BT.601-1 full range
364// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
365// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
366// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
367// BT.601 Mpeg range uses:
368// b 0.1016 * 255 = 25.908 = 25
369// g 0.5078 * 255 = 129.489 = 129
370// r 0.2578 * 255 = 65.739 = 66
371// JPeg 8 bit Y (not used):
372// b 0.11400 * 256 = 29.184 = 29
373// g 0.58700 * 256 = 150.272 = 150
374// r 0.29900 * 256 = 76.544 = 77
375// JPeg 7 bit Y:
376// b 0.11400 * 128 = 14.592 = 15
377// g 0.58700 * 128 = 75.136 = 75
378// r 0.29900 * 128 = 38.272 = 38
379// JPeg 8 bit U:
380// b  0.50000 * 255 = 127.5 = 127
381// g -0.33126 * 255 = -84.4713 = -84
382// r -0.16874 * 255 = -43.0287 = -43
383// JPeg 8 bit V:
384// b -0.08131 * 255 = -20.73405 = -20
385// g -0.41869 * 255 = -106.76595 = -107
386// r  0.50000 * 255 = 127.5 = 127
387
388static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
389  return (38 * r + 75 * g + 15 * b + 64) >> 7;
390}
391
392static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
393  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
394}
395static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
396  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
397}
398
399#define AVGB(a, b) (((a) + (b) + 1) >> 1)
400
401// ARGBToYJ_C and ARGBToUVJ_C
402#define MAKEROWYJ(NAME, R, G, B, BPP)                                     \
403  void NAME##ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
404    int x;                                                                \
405    for (x = 0; x < width; ++x) {                                         \
406      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);       \
407      src_argb0 += BPP;                                                   \
408      dst_y += 1;                                                         \
409    }                                                                     \
410  }                                                                       \
411  void NAME##ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,        \
412                        uint8* dst_u, uint8* dst_v, int width) {          \
413    const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                    \
414    int x;                                                                \
415    for (x = 0; x < width - 1; x += 2) {                                  \
416      uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                     \
417                      AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));        \
418      uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                     \
419                      AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));        \
420      uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                     \
421                      AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));        \
422      dst_u[0] = RGBToUJ(ar, ag, ab);                                     \
423      dst_v[0] = RGBToVJ(ar, ag, ab);                                     \
424      src_rgb0 += BPP * 2;                                                \
425      src_rgb1 += BPP * 2;                                                \
426      dst_u += 1;                                                         \
427      dst_v += 1;                                                         \
428    }                                                                     \
429    if (width & 1) {                                                      \
430      uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                          \
431      uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                          \
432      uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                          \
433      dst_u[0] = RGBToUJ(ar, ag, ab);                                     \
434      dst_v[0] = RGBToVJ(ar, ag, ab);                                     \
435    }                                                                     \
436  }
437
438MAKEROWYJ(ARGB, 2, 1, 0, 4)
439#undef MAKEROWYJ
440
441void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
442  int x;
443  for (x = 0; x < width; ++x) {
444    uint8 b = src_rgb565[0] & 0x1f;
445    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
446    uint8 r = src_rgb565[1] >> 3;
447    b = (b << 3) | (b >> 2);
448    g = (g << 2) | (g >> 4);
449    r = (r << 3) | (r >> 2);
450    dst_y[0] = RGBToY(r, g, b);
451    src_rgb565 += 2;
452    dst_y += 1;
453  }
454}
455
456void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
457  int x;
458  for (x = 0; x < width; ++x) {
459    uint8 b = src_argb1555[0] & 0x1f;
460    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
461    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
462    b = (b << 3) | (b >> 2);
463    g = (g << 3) | (g >> 2);
464    r = (r << 3) | (r >> 2);
465    dst_y[0] = RGBToY(r, g, b);
466    src_argb1555 += 2;
467    dst_y += 1;
468  }
469}
470
471void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
472  int x;
473  for (x = 0; x < width; ++x) {
474    uint8 b = src_argb4444[0] & 0x0f;
475    uint8 g = src_argb4444[0] >> 4;
476    uint8 r = src_argb4444[1] & 0x0f;
477    b = (b << 4) | b;
478    g = (g << 4) | g;
479    r = (r << 4) | r;
480    dst_y[0] = RGBToY(r, g, b);
481    src_argb4444 += 2;
482    dst_y += 1;
483  }
484}
485
486void RGB565ToUVRow_C(const uint8* src_rgb565,
487                     int src_stride_rgb565,
488                     uint8* dst_u,
489                     uint8* dst_v,
490                     int width) {
491  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
492  int x;
493  for (x = 0; x < width - 1; x += 2) {
494    uint8 b0 = src_rgb565[0] & 0x1f;
495    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
496    uint8 r0 = src_rgb565[1] >> 3;
497    uint8 b1 = src_rgb565[2] & 0x1f;
498    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
499    uint8 r1 = src_rgb565[3] >> 3;
500    uint8 b2 = next_rgb565[0] & 0x1f;
501    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
502    uint8 r2 = next_rgb565[1] >> 3;
503    uint8 b3 = next_rgb565[2] & 0x1f;
504    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
505    uint8 r3 = next_rgb565[3] >> 3;
506    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
507    uint8 g = (g0 + g1 + g2 + g3);
508    uint8 r = (r0 + r1 + r2 + r3);
509    b = (b << 1) | (b >> 6);  // 787 -> 888.
510    r = (r << 1) | (r >> 6);
511    dst_u[0] = RGBToU(r, g, b);
512    dst_v[0] = RGBToV(r, g, b);
513    src_rgb565 += 4;
514    next_rgb565 += 4;
515    dst_u += 1;
516    dst_v += 1;
517  }
518  if (width & 1) {
519    uint8 b0 = src_rgb565[0] & 0x1f;
520    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
521    uint8 r0 = src_rgb565[1] >> 3;
522    uint8 b2 = next_rgb565[0] & 0x1f;
523    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
524    uint8 r2 = next_rgb565[1] >> 3;
525    uint8 b = (b0 + b2);  // 565 * 2 = 676.
526    uint8 g = (g0 + g2);
527    uint8 r = (r0 + r2);
528    b = (b << 2) | (b >> 4);  // 676 -> 888
529    g = (g << 1) | (g >> 6);
530    r = (r << 2) | (r >> 4);
531    dst_u[0] = RGBToU(r, g, b);
532    dst_v[0] = RGBToV(r, g, b);
533  }
534}
535
536void ARGB1555ToUVRow_C(const uint8* src_argb1555,
537                       int src_stride_argb1555,
538                       uint8* dst_u,
539                       uint8* dst_v,
540                       int width) {
541  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
542  int x;
543  for (x = 0; x < width - 1; x += 2) {
544    uint8 b0 = src_argb1555[0] & 0x1f;
545    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
546    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
547    uint8 b1 = src_argb1555[2] & 0x1f;
548    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
549    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
550    uint8 b2 = next_argb1555[0] & 0x1f;
551    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
552    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
553    uint8 b3 = next_argb1555[2] & 0x1f;
554    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
555    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
556    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
557    uint8 g = (g0 + g1 + g2 + g3);
558    uint8 r = (r0 + r1 + r2 + r3);
559    b = (b << 1) | (b >> 6);  // 777 -> 888.
560    g = (g << 1) | (g >> 6);
561    r = (r << 1) | (r >> 6);
562    dst_u[0] = RGBToU(r, g, b);
563    dst_v[0] = RGBToV(r, g, b);
564    src_argb1555 += 4;
565    next_argb1555 += 4;
566    dst_u += 1;
567    dst_v += 1;
568  }
569  if (width & 1) {
570    uint8 b0 = src_argb1555[0] & 0x1f;
571    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
572    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
573    uint8 b2 = next_argb1555[0] & 0x1f;
574    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
575    uint8 r2 = next_argb1555[1] >> 3;
576    uint8 b = (b0 + b2);  // 555 * 2 = 666.
577    uint8 g = (g0 + g2);
578    uint8 r = (r0 + r2);
579    b = (b << 2) | (b >> 4);  // 666 -> 888.
580    g = (g << 2) | (g >> 4);
581    r = (r << 2) | (r >> 4);
582    dst_u[0] = RGBToU(r, g, b);
583    dst_v[0] = RGBToV(r, g, b);
584  }
585}
586
587void ARGB4444ToUVRow_C(const uint8* src_argb4444,
588                       int src_stride_argb4444,
589                       uint8* dst_u,
590                       uint8* dst_v,
591                       int width) {
592  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
593  int x;
594  for (x = 0; x < width - 1; x += 2) {
595    uint8 b0 = src_argb4444[0] & 0x0f;
596    uint8 g0 = src_argb4444[0] >> 4;
597    uint8 r0 = src_argb4444[1] & 0x0f;
598    uint8 b1 = src_argb4444[2] & 0x0f;
599    uint8 g1 = src_argb4444[2] >> 4;
600    uint8 r1 = src_argb4444[3] & 0x0f;
601    uint8 b2 = next_argb4444[0] & 0x0f;
602    uint8 g2 = next_argb4444[0] >> 4;
603    uint8 r2 = next_argb4444[1] & 0x0f;
604    uint8 b3 = next_argb4444[2] & 0x0f;
605    uint8 g3 = next_argb4444[2] >> 4;
606    uint8 r3 = next_argb4444[3] & 0x0f;
607    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
608    uint8 g = (g0 + g1 + g2 + g3);
609    uint8 r = (r0 + r1 + r2 + r3);
610    b = (b << 2) | (b >> 4);  // 666 -> 888.
611    g = (g << 2) | (g >> 4);
612    r = (r << 2) | (r >> 4);
613    dst_u[0] = RGBToU(r, g, b);
614    dst_v[0] = RGBToV(r, g, b);
615    src_argb4444 += 4;
616    next_argb4444 += 4;
617    dst_u += 1;
618    dst_v += 1;
619  }
620  if (width & 1) {
621    uint8 b0 = src_argb4444[0] & 0x0f;
622    uint8 g0 = src_argb4444[0] >> 4;
623    uint8 r0 = src_argb4444[1] & 0x0f;
624    uint8 b2 = next_argb4444[0] & 0x0f;
625    uint8 g2 = next_argb4444[0] >> 4;
626    uint8 r2 = next_argb4444[1] & 0x0f;
627    uint8 b = (b0 + b2);  // 444 * 2 = 555.
628    uint8 g = (g0 + g2);
629    uint8 r = (r0 + r2);
630    b = (b << 3) | (b >> 2);  // 555 -> 888.
631    g = (g << 3) | (g >> 2);
632    r = (r << 3) | (r >> 2);
633    dst_u[0] = RGBToU(r, g, b);
634    dst_v[0] = RGBToV(r, g, b);
635  }
636}
637
638void ARGBToUV444Row_C(const uint8* src_argb,
639                      uint8* dst_u,
640                      uint8* dst_v,
641                      int width) {
642  int x;
643  for (x = 0; x < width; ++x) {
644    uint8 ab = src_argb[0];
645    uint8 ag = src_argb[1];
646    uint8 ar = src_argb[2];
647    dst_u[0] = RGBToU(ar, ag, ab);
648    dst_v[0] = RGBToV(ar, ag, ab);
649    src_argb += 4;
650    dst_u += 1;
651    dst_v += 1;
652  }
653}
654
655void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
656  int x;
657  for (x = 0; x < width; ++x) {
658    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
659    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
660    dst_argb[3] = src_argb[3];
661    dst_argb += 4;
662    src_argb += 4;
663  }
664}
665
666// Convert a row of image to Sepia tone.
667void ARGBSepiaRow_C(uint8* dst_argb, int width) {
668  int x;
669  for (x = 0; x < width; ++x) {
670    int b = dst_argb[0];
671    int g = dst_argb[1];
672    int r = dst_argb[2];
673    int sb = (b * 17 + g * 68 + r * 35) >> 7;
674    int sg = (b * 22 + g * 88 + r * 45) >> 7;
675    int sr = (b * 24 + g * 98 + r * 50) >> 7;
676    // b does not over flow. a is preserved from original.
677    dst_argb[0] = sb;
678    dst_argb[1] = clamp255(sg);
679    dst_argb[2] = clamp255(sr);
680    dst_argb += 4;
681  }
682}
683
684// Apply color matrix to a row of image. Matrix is signed.
685// TODO(fbarchard): Consider adding rounding (+32).
686void ARGBColorMatrixRow_C(const uint8* src_argb,
687                          uint8* dst_argb,
688                          const int8* matrix_argb,
689                          int width) {
690  int x;
691  for (x = 0; x < width; ++x) {
692    int b = src_argb[0];
693    int g = src_argb[1];
694    int r = src_argb[2];
695    int a = src_argb[3];
696    int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] +
697              a * matrix_argb[3]) >>
698             6;
699    int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] +
700              a * matrix_argb[7]) >>
701             6;
702    int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] +
703              a * matrix_argb[11]) >>
704             6;
705    int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
706              a * matrix_argb[15]) >>
707             6;
708    dst_argb[0] = Clamp(sb);
709    dst_argb[1] = Clamp(sg);
710    dst_argb[2] = Clamp(sr);
711    dst_argb[3] = Clamp(sa);
712    src_argb += 4;
713    dst_argb += 4;
714  }
715}
716
717// Apply color table to a row of image.
718void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
719  int x;
720  for (x = 0; x < width; ++x) {
721    int b = dst_argb[0];
722    int g = dst_argb[1];
723    int r = dst_argb[2];
724    int a = dst_argb[3];
725    dst_argb[0] = table_argb[b * 4 + 0];
726    dst_argb[1] = table_argb[g * 4 + 1];
727    dst_argb[2] = table_argb[r * 4 + 2];
728    dst_argb[3] = table_argb[a * 4 + 3];
729    dst_argb += 4;
730  }
731}
732
733// Apply color table to a row of image.
734void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
735  int x;
736  for (x = 0; x < width; ++x) {
737    int b = dst_argb[0];
738    int g = dst_argb[1];
739    int r = dst_argb[2];
740    dst_argb[0] = table_argb[b * 4 + 0];
741    dst_argb[1] = table_argb[g * 4 + 1];
742    dst_argb[2] = table_argb[r * 4 + 2];
743    dst_argb += 4;
744  }
745}
746
747void ARGBQuantizeRow_C(uint8* dst_argb,
748                       int scale,
749                       int interval_size,
750                       int interval_offset,
751                       int width) {
752  int x;
753  for (x = 0; x < width; ++x) {
754    int b = dst_argb[0];
755    int g = dst_argb[1];
756    int r = dst_argb[2];
757    dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
758    dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
759    dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
760    dst_argb += 4;
761  }
762}
763
764#define REPEAT8(v) (v) | ((v) << 8)
765#define SHADE(f, v) v* f >> 24
766
767void ARGBShadeRow_C(const uint8* src_argb,
768                    uint8* dst_argb,
769                    int width,
770                    uint32 value) {
771  const uint32 b_scale = REPEAT8(value & 0xff);
772  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
773  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
774  const uint32 a_scale = REPEAT8(value >> 24);
775
776  int i;
777  for (i = 0; i < width; ++i) {
778    const uint32 b = REPEAT8(src_argb[0]);
779    const uint32 g = REPEAT8(src_argb[1]);
780    const uint32 r = REPEAT8(src_argb[2]);
781    const uint32 a = REPEAT8(src_argb[3]);
782    dst_argb[0] = SHADE(b, b_scale);
783    dst_argb[1] = SHADE(g, g_scale);
784    dst_argb[2] = SHADE(r, r_scale);
785    dst_argb[3] = SHADE(a, a_scale);
786    src_argb += 4;
787    dst_argb += 4;
788  }
789}
790#undef REPEAT8
791#undef SHADE
792
793#define REPEAT8(v) (v) | ((v) << 8)
794#define SHADE(f, v) v* f >> 16
795
796void ARGBMultiplyRow_C(const uint8* src_argb0,
797                       const uint8* src_argb1,
798                       uint8* dst_argb,
799                       int width) {
800  int i;
801  for (i = 0; i < width; ++i) {
802    const uint32 b = REPEAT8(src_argb0[0]);
803    const uint32 g = REPEAT8(src_argb0[1]);
804    const uint32 r = REPEAT8(src_argb0[2]);
805    const uint32 a = REPEAT8(src_argb0[3]);
806    const uint32 b_scale = src_argb1[0];
807    const uint32 g_scale = src_argb1[1];
808    const uint32 r_scale = src_argb1[2];
809    const uint32 a_scale = src_argb1[3];
810    dst_argb[0] = SHADE(b, b_scale);
811    dst_argb[1] = SHADE(g, g_scale);
812    dst_argb[2] = SHADE(r, r_scale);
813    dst_argb[3] = SHADE(a, a_scale);
814    src_argb0 += 4;
815    src_argb1 += 4;
816    dst_argb += 4;
817  }
818}
819#undef REPEAT8
820#undef SHADE
821
822#define SHADE(f, v) clamp255(v + f)
823
824void ARGBAddRow_C(const uint8* src_argb0,
825                  const uint8* src_argb1,
826                  uint8* dst_argb,
827                  int width) {
828  int i;
829  for (i = 0; i < width; ++i) {
830    const int b = src_argb0[0];
831    const int g = src_argb0[1];
832    const int r = src_argb0[2];
833    const int a = src_argb0[3];
834    const int b_add = src_argb1[0];
835    const int g_add = src_argb1[1];
836    const int r_add = src_argb1[2];
837    const int a_add = src_argb1[3];
838    dst_argb[0] = SHADE(b, b_add);
839    dst_argb[1] = SHADE(g, g_add);
840    dst_argb[2] = SHADE(r, r_add);
841    dst_argb[3] = SHADE(a, a_add);
842    src_argb0 += 4;
843    src_argb1 += 4;
844    dst_argb += 4;
845  }
846}
847#undef SHADE
848
849#define SHADE(f, v) clamp0(f - v)
850
851void ARGBSubtractRow_C(const uint8* src_argb0,
852                       const uint8* src_argb1,
853                       uint8* dst_argb,
854                       int width) {
855  int i;
856  for (i = 0; i < width; ++i) {
857    const int b = src_argb0[0];
858    const int g = src_argb0[1];
859    const int r = src_argb0[2];
860    const int a = src_argb0[3];
861    const int b_sub = src_argb1[0];
862    const int g_sub = src_argb1[1];
863    const int r_sub = src_argb1[2];
864    const int a_sub = src_argb1[3];
865    dst_argb[0] = SHADE(b, b_sub);
866    dst_argb[1] = SHADE(g, g_sub);
867    dst_argb[2] = SHADE(r, r_sub);
868    dst_argb[3] = SHADE(a, a_sub);
869    src_argb0 += 4;
870    src_argb1 += 4;
871    dst_argb += 4;
872  }
873}
874#undef SHADE
875
876// Sobel functions which mimics SSSE3.
877void SobelXRow_C(const uint8* src_y0,
878                 const uint8* src_y1,
879                 const uint8* src_y2,
880                 uint8* dst_sobelx,
881                 int width) {
882  int i;
883  for (i = 0; i < width; ++i) {
884    int a = src_y0[i];
885    int b = src_y1[i];
886    int c = src_y2[i];
887    int a_sub = src_y0[i + 2];
888    int b_sub = src_y1[i + 2];
889    int c_sub = src_y2[i + 2];
890    int a_diff = a - a_sub;
891    int b_diff = b - b_sub;
892    int c_diff = c - c_sub;
893    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
894    dst_sobelx[i] = (uint8)(clamp255(sobel));
895  }
896}
897
898void SobelYRow_C(const uint8* src_y0,
899                 const uint8* src_y1,
900                 uint8* dst_sobely,
901                 int width) {
902  int i;
903  for (i = 0; i < width; ++i) {
904    int a = src_y0[i + 0];
905    int b = src_y0[i + 1];
906    int c = src_y0[i + 2];
907    int a_sub = src_y1[i + 0];
908    int b_sub = src_y1[i + 1];
909    int c_sub = src_y1[i + 2];
910    int a_diff = a - a_sub;
911    int b_diff = b - b_sub;
912    int c_diff = c - c_sub;
913    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
914    dst_sobely[i] = (uint8)(clamp255(sobel));
915  }
916}
917
918void SobelRow_C(const uint8* src_sobelx,
919                const uint8* src_sobely,
920                uint8* dst_argb,
921                int width) {
922  int i;
923  for (i = 0; i < width; ++i) {
924    int r = src_sobelx[i];
925    int b = src_sobely[i];
926    int s = clamp255(r + b);
927    dst_argb[0] = (uint8)(s);
928    dst_argb[1] = (uint8)(s);
929    dst_argb[2] = (uint8)(s);
930    dst_argb[3] = (uint8)(255u);
931    dst_argb += 4;
932  }
933}
934
935void SobelToPlaneRow_C(const uint8* src_sobelx,
936                       const uint8* src_sobely,
937                       uint8* dst_y,
938                       int width) {
939  int i;
940  for (i = 0; i < width; ++i) {
941    int r = src_sobelx[i];
942    int b = src_sobely[i];
943    int s = clamp255(r + b);
944    dst_y[i] = (uint8)(s);
945  }
946}
947
948void SobelXYRow_C(const uint8* src_sobelx,
949                  const uint8* src_sobely,
950                  uint8* dst_argb,
951                  int width) {
952  int i;
953  for (i = 0; i < width; ++i) {
954    int r = src_sobelx[i];
955    int b = src_sobely[i];
956    int g = clamp255(r + b);
957    dst_argb[0] = (uint8)(b);
958    dst_argb[1] = (uint8)(g);
959    dst_argb[2] = (uint8)(r);
960    dst_argb[3] = (uint8)(255u);
961    dst_argb += 4;
962  }
963}
964
965void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
966  // Copy a Y to RGB.
967  int x;
968  for (x = 0; x < width; ++x) {
969    uint8 y = src_y[0];
970    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
971    dst_argb[3] = 255u;
972    dst_argb += 4;
973    ++src_y;
974  }
975}
976
977// TODO(fbarchard): Unify these structures to be platform independent.
978// TODO(fbarchard): Generate SIMD structures from float matrix.
979
// BT.601 YUV to RGB reference
//  R = (Y - 16) * 1.164              - V * -1.596
//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
//  B = (Y - 16) * 1.164 - U * -2.018
//
// The coefficients below are the reference values scaled by 64 (6 fractional
// bits), as the per-constant comments show.

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25   /* round(0.391 * 64) */
#define VG 52   /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)

// The table layout differs per target architecture; YuvPixel() in this file
// reads the values back with matching per-architecture fields and indices.
// The kYvu* variant holds the same values with the U and V coefficients (and
// the B and R biases) exchanged.
#if defined(__aarch64__)  // 64 bit arm
// UB and VR are stored negated and interleaved; the Y gain is stored
// multiplied by 0x0101 (YuvPixel() divides it back out).
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
    {UG, VG, UG, VG, UG, VG, UG, VG},
    {UG, VG, UG, VG, UG, VG, UG, VG},
    {BB, BG, BR, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
    {VG, UG, VG, UG, VG, UG, VG, UG},
    {VG, UG, VG, UG, VG, UG, VG, UG},
    {BR, BG, BB, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
#elif defined(__arm__)  // 32 bit arm
// UB and VR are stored negated in groups of four (U group first, then V);
// the Y gain is stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
    {BB, BG, BR, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
    {BR, BG, BB, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
#else
// Other targets: separate rows of interleaved U,V coefficients for B, G and
// R, then one row each for the B, G and R biases, then the Y gain.
const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
#endif

// Release the BT.601 macro names; they are redefined with different values
// further down in this file.
#undef BB
#undef BG
#undef BR
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
1060
// JPEG YUV to RGB reference
// *  R = Y                - V * -1.40200
// *  G = Y - U *  0.34414 - V *  0.71414
// *  B = Y - U * -1.77200
//
// The coefficients below are the reference values scaled by 64 (6 fractional
// bits), as the per-constant comments show. Unlike BT.601 above, the Y bias
// contains only the rounding term (no -16 offset).

// Y contribution to R,G,B.  Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGB 32   /* 64 / 2 */

// U and V contributions to R,G,B.
#define UB -113 /* round(-1.77200 * 64) */
#define UG 22   /* round(0.34414 * 64) */
#define VG 46   /* round(0.71414  * 64) */
#define VR -90  /* round(-1.40200 * 64) */

// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)

// The table layout differs per target architecture; YuvPixel() in this file
// reads the values back with matching per-architecture fields and indices.
// The kYvu* variant holds the same values with the U and V coefficients (and
// the B and R biases) exchanged.
#if defined(__aarch64__)
// UB and VR are stored negated and interleaved; the Y gain is stored
// multiplied by 0x0101 (YuvPixel() divides it back out).
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
    {UG, VG, UG, VG, UG, VG, UG, VG},
    {UG, VG, UG, VG, UG, VG, UG, VG},
    {BB, BG, BR, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
    {VG, UG, VG, UG, VG, UG, VG, UG},
    {VG, UG, VG, UG, VG, UG, VG, UG},
    {BR, BG, BB, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
#elif defined(__arm__)
// UB and VR are stored negated in groups of four (U group first, then V);
// the Y gain is stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
    {BB, BG, BR, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
    {BR, BG, BB, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
#else
// Other targets: separate rows of interleaved U,V coefficients for B, G and
// R, then one row each for the B, G and R biases, then the Y gain.
const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
#endif

// Release the JPEG macro names; they are redefined with different values
// further down in this file.
#undef BB
#undef BG
#undef BR
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
1141
// BT.709 YUV to RGB reference
//  R = (Y - 16) * 1.164              - V * -1.793
//  G = (Y - 16) * 1.164 - U *  0.213 - V *  0.533
//  B = (Y - 16) * 1.164 - U * -2.112
// See also http://www.equasys.de/colorconversion.html
//
// The coefficients below are the reference values scaled by 64 (6 fractional
// bits), as the per-constant comments show.

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
// UB is limited to -128 (i.e. -2.0 * 64) rather than round(-2.112 * 64).
#define UB -128 /* max(-128, round(-2.112 * 64)) */
#define UG 14   /* round(0.213 * 64) */
#define VG 34   /* round(0.533  * 64) */
#define VR -115 /* round(-1.793 * 64) */

// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)

// The table layout differs per target architecture; YuvPixel() in this file
// reads the values back with matching per-architecture fields and indices.
// The kYvu* variant holds the same values with the U and V coefficients (and
// the B and R biases) exchanged.
#if defined(__aarch64__)
// UB and VR are stored negated and interleaved; the Y gain is stored
// multiplied by 0x0101 (YuvPixel() divides it back out).
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
    {UG, VG, UG, VG, UG, VG, UG, VG},
    {UG, VG, UG, VG, UG, VG, UG, VG},
    {BB, BG, BR, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
    {VG, UG, VG, UG, VG, UG, VG, UG},
    {VG, UG, VG, UG, VG, UG, VG, UG},
    {BR, BG, BB, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
#elif defined(__arm__)
// UB and VR are stored negated in groups of four (U group first, then V);
// the Y gain is stored multiplied by 0x0101.
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
    {BB, BG, BR, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
    {BR, BG, BB, 0, 0, 0, 0, 0},
    {0x0101 * YG, 0, 0, 0}};
#else
// Other targets: separate rows of interleaved U,V coefficients for B, G and
// R, then one row each for the B, G and R biases, then the Y gain.
const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
#endif

// Release the BT.709 macro names; YG and YGB are defined again further down
// in this file.
#undef BB
#undef BG
#undef BR
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
1224
1225// C reference code that mimics the YUV assembly.
1226static __inline void YuvPixel(uint8 y,
1227                              uint8 u,
1228                              uint8 v,
1229                              uint8* b,
1230                              uint8* g,
1231                              uint8* r,
1232                              const struct YuvConstants* yuvconstants) {
1233#if defined(__aarch64__)
1234  int ub = -yuvconstants->kUVToRB[0];
1235  int ug = yuvconstants->kUVToG[0];
1236  int vg = yuvconstants->kUVToG[1];
1237  int vr = -yuvconstants->kUVToRB[1];
1238  int bb = yuvconstants->kUVBiasBGR[0];
1239  int bg = yuvconstants->kUVBiasBGR[1];
1240  int br = yuvconstants->kUVBiasBGR[2];
1241  int yg = yuvconstants->kYToRgb[0] / 0x0101;
1242#elif defined(__arm__)
1243  int ub = -yuvconstants->kUVToRB[0];
1244  int ug = yuvconstants->kUVToG[0];
1245  int vg = yuvconstants->kUVToG[4];
1246  int vr = -yuvconstants->kUVToRB[4];
1247  int bb = yuvconstants->kUVBiasBGR[0];
1248  int bg = yuvconstants->kUVBiasBGR[1];
1249  int br = yuvconstants->kUVBiasBGR[2];
1250  int yg = yuvconstants->kYToRgb[0] / 0x0101;
1251#else
1252  int ub = yuvconstants->kUVToB[0];
1253  int ug = yuvconstants->kUVToG[0];
1254  int vg = yuvconstants->kUVToG[1];
1255  int vr = yuvconstants->kUVToR[1];
1256  int bb = yuvconstants->kUVBiasB[0];
1257  int bg = yuvconstants->kUVBiasG[0];
1258  int br = yuvconstants->kUVBiasR[0];
1259  int yg = yuvconstants->kYToRgb[0];
1260#endif
1261
1262  uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
1263  *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
1264  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
1265  *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
1266}
1267
1268// Y contribution to R,G,B.  Scale and bias.
1269#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
1270#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
1271
1272// C reference code that mimics the YUV assembly.
1273static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
1274  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
1275  *b = Clamp((int32)(y1 + YGB) >> 6);
1276  *g = Clamp((int32)(y1 + YGB) >> 6);
1277  *r = Clamp((int32)(y1 + YGB) >> 6);
1278}
1279
1280#undef YG
1281#undef YGB
1282
1283#if !defined(LIBYUV_DISABLE_NEON) && \
1284    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
1285// C mimic assembly.
1286// TODO(fbarchard): Remove subsampling from Neon.
1287void I444ToARGBRow_C(const uint8* src_y,
1288                     const uint8* src_u,
1289                     const uint8* src_v,
1290                     uint8* rgb_buf,
1291                     const struct YuvConstants* yuvconstants,
1292                     int width) {
1293  int x;
1294  for (x = 0; x < width - 1; x += 2) {
1295    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
1296    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
1297    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
1298             yuvconstants);
1299    rgb_buf[3] = 255;
1300    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
1301             yuvconstants);
1302    rgb_buf[7] = 255;
1303    src_y += 2;
1304    src_u += 2;
1305    src_v += 2;
1306    rgb_buf += 8;  // Advance 2 pixels.
1307  }
1308  if (width & 1) {
1309    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1310             rgb_buf + 2, yuvconstants);
1311    rgb_buf[3] = 255;
1312  }
1313}
1314#else
1315void I444ToARGBRow_C(const uint8* src_y,
1316                     const uint8* src_u,
1317                     const uint8* src_v,
1318                     uint8* rgb_buf,
1319                     const struct YuvConstants* yuvconstants,
1320                     int width) {
1321  int x;
1322  for (x = 0; x < width; ++x) {
1323    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1324             rgb_buf + 2, yuvconstants);
1325    rgb_buf[3] = 255;
1326    src_y += 1;
1327    src_u += 1;
1328    src_v += 1;
1329    rgb_buf += 4;  // Advance 1 pixel.
1330  }
1331}
1332#endif
1333
1334// Also used for 420
1335void I422ToARGBRow_C(const uint8* src_y,
1336                     const uint8* src_u,
1337                     const uint8* src_v,
1338                     uint8* rgb_buf,
1339                     const struct YuvConstants* yuvconstants,
1340                     int width) {
1341  int x;
1342  for (x = 0; x < width - 1; x += 2) {
1343    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1344             rgb_buf + 2, yuvconstants);
1345    rgb_buf[3] = 255;
1346    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
1347             rgb_buf + 6, yuvconstants);
1348    rgb_buf[7] = 255;
1349    src_y += 2;
1350    src_u += 1;
1351    src_v += 1;
1352    rgb_buf += 8;  // Advance 2 pixels.
1353  }
1354  if (width & 1) {
1355    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1356             rgb_buf + 2, yuvconstants);
1357    rgb_buf[3] = 255;
1358  }
1359}
1360
1361void I422AlphaToARGBRow_C(const uint8* src_y,
1362                          const uint8* src_u,
1363                          const uint8* src_v,
1364                          const uint8* src_a,
1365                          uint8* rgb_buf,
1366                          const struct YuvConstants* yuvconstants,
1367                          int width) {
1368  int x;
1369  for (x = 0; x < width - 1; x += 2) {
1370    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1371             rgb_buf + 2, yuvconstants);
1372    rgb_buf[3] = src_a[0];
1373    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
1374             rgb_buf + 6, yuvconstants);
1375    rgb_buf[7] = src_a[1];
1376    src_y += 2;
1377    src_u += 1;
1378    src_v += 1;
1379    src_a += 2;
1380    rgb_buf += 8;  // Advance 2 pixels.
1381  }
1382  if (width & 1) {
1383    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1384             rgb_buf + 2, yuvconstants);
1385    rgb_buf[3] = src_a[0];
1386  }
1387}
1388
1389void I422ToRGB24Row_C(const uint8* src_y,
1390                      const uint8* src_u,
1391                      const uint8* src_v,
1392                      uint8* rgb_buf,
1393                      const struct YuvConstants* yuvconstants,
1394                      int width) {
1395  int x;
1396  for (x = 0; x < width - 1; x += 2) {
1397    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1398             rgb_buf + 2, yuvconstants);
1399    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
1400             rgb_buf + 5, yuvconstants);
1401    src_y += 2;
1402    src_u += 1;
1403    src_v += 1;
1404    rgb_buf += 6;  // Advance 2 pixels.
1405  }
1406  if (width & 1) {
1407    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
1408             rgb_buf + 2, yuvconstants);
1409  }
1410}
1411
1412void I422ToARGB4444Row_C(const uint8* src_y,
1413                         const uint8* src_u,
1414                         const uint8* src_v,
1415                         uint8* dst_argb4444,
1416                         const struct YuvConstants* yuvconstants,
1417                         int width) {
1418  uint8 b0;
1419  uint8 g0;
1420  uint8 r0;
1421  uint8 b1;
1422  uint8 g1;
1423  uint8 r1;
1424  int x;
1425  for (x = 0; x < width - 1; x += 2) {
1426    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1427    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
1428    b0 = b0 >> 4;
1429    g0 = g0 >> 4;
1430    r0 = r0 >> 4;
1431    b1 = b1 >> 4;
1432    g1 = g1 >> 4;
1433    r1 = r1 >> 4;
1434    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
1435                               (g1 << 20) | (r1 << 24) | 0xf000f000;
1436    src_y += 2;
1437    src_u += 1;
1438    src_v += 1;
1439    dst_argb4444 += 4;  // Advance 2 pixels.
1440  }
1441  if (width & 1) {
1442    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1443    b0 = b0 >> 4;
1444    g0 = g0 >> 4;
1445    r0 = r0 >> 4;
1446    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
1447  }
1448}
1449
1450void I422ToARGB1555Row_C(const uint8* src_y,
1451                         const uint8* src_u,
1452                         const uint8* src_v,
1453                         uint8* dst_argb1555,
1454                         const struct YuvConstants* yuvconstants,
1455                         int width) {
1456  uint8 b0;
1457  uint8 g0;
1458  uint8 r0;
1459  uint8 b1;
1460  uint8 g1;
1461  uint8 r1;
1462  int x;
1463  for (x = 0; x < width - 1; x += 2) {
1464    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1465    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
1466    b0 = b0 >> 3;
1467    g0 = g0 >> 3;
1468    r0 = r0 >> 3;
1469    b1 = b1 >> 3;
1470    g1 = g1 >> 3;
1471    r1 = r1 >> 3;
1472    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
1473                               (g1 << 21) | (r1 << 26) | 0x80008000;
1474    src_y += 2;
1475    src_u += 1;
1476    src_v += 1;
1477    dst_argb1555 += 4;  // Advance 2 pixels.
1478  }
1479  if (width & 1) {
1480    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1481    b0 = b0 >> 3;
1482    g0 = g0 >> 3;
1483    r0 = r0 >> 3;
1484    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
1485  }
1486}
1487
1488void I422ToRGB565Row_C(const uint8* src_y,
1489                       const uint8* src_u,
1490                       const uint8* src_v,
1491                       uint8* dst_rgb565,
1492                       const struct YuvConstants* yuvconstants,
1493                       int width) {
1494  uint8 b0;
1495  uint8 g0;
1496  uint8 r0;
1497  uint8 b1;
1498  uint8 g1;
1499  uint8 r1;
1500  int x;
1501  for (x = 0; x < width - 1; x += 2) {
1502    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1503    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
1504    b0 = b0 >> 3;
1505    g0 = g0 >> 2;
1506    r0 = r0 >> 3;
1507    b1 = b1 >> 3;
1508    g1 = g1 >> 2;
1509    r1 = r1 >> 3;
1510    *(uint32*)(dst_rgb565) =
1511        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
1512    src_y += 2;
1513    src_u += 1;
1514    src_v += 1;
1515    dst_rgb565 += 4;  // Advance 2 pixels.
1516  }
1517  if (width & 1) {
1518    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
1519    b0 = b0 >> 3;
1520    g0 = g0 >> 2;
1521    r0 = r0 >> 3;
1522    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1523  }
1524}
1525
1526void NV12ToARGBRow_C(const uint8* src_y,
1527                     const uint8* src_uv,
1528                     uint8* rgb_buf,
1529                     const struct YuvConstants* yuvconstants,
1530                     int width) {
1531  int x;
1532  for (x = 0; x < width - 1; x += 2) {
1533    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
1534             rgb_buf + 2, yuvconstants);
1535    rgb_buf[3] = 255;
1536    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
1537             rgb_buf + 6, yuvconstants);
1538    rgb_buf[7] = 255;
1539    src_y += 2;
1540    src_uv += 2;
1541    rgb_buf += 8;  // Advance 2 pixels.
1542  }
1543  if (width & 1) {
1544    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
1545             rgb_buf + 2, yuvconstants);
1546    rgb_buf[3] = 255;
1547  }
1548}
1549
1550void NV21ToARGBRow_C(const uint8* src_y,
1551                     const uint8* src_vu,
1552                     uint8* rgb_buf,
1553                     const struct YuvConstants* yuvconstants,
1554                     int width) {
1555  int x;
1556  for (x = 0; x < width - 1; x += 2) {
1557    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
1558             rgb_buf + 2, yuvconstants);
1559    rgb_buf[3] = 255;
1560    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
1561             rgb_buf + 6, yuvconstants);
1562    rgb_buf[7] = 255;
1563    src_y += 2;
1564    src_vu += 2;
1565    rgb_buf += 8;  // Advance 2 pixels.
1566  }
1567  if (width & 1) {
1568    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
1569             rgb_buf + 2, yuvconstants);
1570    rgb_buf[3] = 255;
1571  }
1572}
1573
1574void NV12ToRGB565Row_C(const uint8* src_y,
1575                       const uint8* src_uv,
1576                       uint8* dst_rgb565,
1577                       const struct YuvConstants* yuvconstants,
1578                       int width) {
1579  uint8 b0;
1580  uint8 g0;
1581  uint8 r0;
1582  uint8 b1;
1583  uint8 g1;
1584  uint8 r1;
1585  int x;
1586  for (x = 0; x < width - 1; x += 2) {
1587    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
1588    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
1589    b0 = b0 >> 3;
1590    g0 = g0 >> 2;
1591    r0 = r0 >> 3;
1592    b1 = b1 >> 3;
1593    g1 = g1 >> 2;
1594    r1 = r1 >> 3;
1595    *(uint32*)(dst_rgb565) =
1596        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
1597    src_y += 2;
1598    src_uv += 2;
1599    dst_rgb565 += 4;  // Advance 2 pixels.
1600  }
1601  if (width & 1) {
1602    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
1603    b0 = b0 >> 3;
1604    g0 = g0 >> 2;
1605    r0 = r0 >> 3;
1606    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1607  }
1608}
1609
1610void YUY2ToARGBRow_C(const uint8* src_yuy2,
1611                     uint8* rgb_buf,
1612                     const struct YuvConstants* yuvconstants,
1613                     int width) {
1614  int x;
1615  for (x = 0; x < width - 1; x += 2) {
1616    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
1617             rgb_buf + 2, yuvconstants);
1618    rgb_buf[3] = 255;
1619    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
1620             rgb_buf + 6, yuvconstants);
1621    rgb_buf[7] = 255;
1622    src_yuy2 += 4;
1623    rgb_buf += 8;  // Advance 2 pixels.
1624  }
1625  if (width & 1) {
1626    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
1627             rgb_buf + 2, yuvconstants);
1628    rgb_buf[3] = 255;
1629  }
1630}
1631
1632void UYVYToARGBRow_C(const uint8* src_uyvy,
1633                     uint8* rgb_buf,
1634                     const struct YuvConstants* yuvconstants,
1635                     int width) {
1636  int x;
1637  for (x = 0; x < width - 1; x += 2) {
1638    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
1639             rgb_buf + 2, yuvconstants);
1640    rgb_buf[3] = 255;
1641    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
1642             rgb_buf + 6, yuvconstants);
1643    rgb_buf[7] = 255;
1644    src_uyvy += 4;
1645    rgb_buf += 8;  // Advance 2 pixels.
1646  }
1647  if (width & 1) {
1648    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
1649             rgb_buf + 2, yuvconstants);
1650    rgb_buf[3] = 255;
1651  }
1652}
1653
1654void I422ToRGBARow_C(const uint8* src_y,
1655                     const uint8* src_u,
1656                     const uint8* src_v,
1657                     uint8* rgb_buf,
1658                     const struct YuvConstants* yuvconstants,
1659                     int width) {
1660  int x;
1661  for (x = 0; x < width - 1; x += 2) {
1662    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
1663             rgb_buf + 3, yuvconstants);
1664    rgb_buf[0] = 255;
1665    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
1666             rgb_buf + 7, yuvconstants);
1667    rgb_buf[4] = 255;
1668    src_y += 2;
1669    src_u += 1;
1670    src_v += 1;
1671    rgb_buf += 8;  // Advance 2 pixels.
1672  }
1673  if (width & 1) {
1674    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
1675             rgb_buf + 3, yuvconstants);
1676    rgb_buf[0] = 255;
1677  }
1678}
1679
1680void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
1681  int x;
1682  for (x = 0; x < width - 1; x += 2) {
1683    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1684    rgb_buf[3] = 255;
1685    YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1686    rgb_buf[7] = 255;
1687    src_y += 2;
1688    rgb_buf += 8;  // Advance 2 pixels.
1689  }
1690  if (width & 1) {
1691    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1692    rgb_buf[3] = 255;
1693  }
1694}
1695
1696void MirrorRow_C(const uint8* src, uint8* dst, int width) {
1697  int x;
1698  src += width - 1;
1699  for (x = 0; x < width - 1; x += 2) {
1700    dst[x] = src[0];
1701    dst[x + 1] = src[-1];
1702    src -= 2;
1703  }
1704  if (width & 1) {
1705    dst[width - 1] = src[0];
1706  }
1707}
1708
1709void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1710  int x;
1711  src_uv += (width - 1) << 1;
1712  for (x = 0; x < width - 1; x += 2) {
1713    dst_u[x] = src_uv[0];
1714    dst_u[x + 1] = src_uv[-2];
1715    dst_v[x] = src_uv[1];
1716    dst_v[x + 1] = src_uv[-2 + 1];
1717    src_uv -= 4;
1718  }
1719  if (width & 1) {
1720    dst_u[width - 1] = src_uv[0];
1721    dst_v[width - 1] = src_uv[1];
1722  }
1723}
1724
1725void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
1726  int x;
1727  const uint32* src32 = (const uint32*)(src);
1728  uint32* dst32 = (uint32*)(dst);
1729  src32 += width - 1;
1730  for (x = 0; x < width - 1; x += 2) {
1731    dst32[x] = src32[0];
1732    dst32[x + 1] = src32[-1];
1733    src32 -= 2;
1734  }
1735  if (width & 1) {
1736    dst32[width - 1] = src32[0];
1737  }
1738}
1739
1740void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1741  int x;
1742  for (x = 0; x < width - 1; x += 2) {
1743    dst_u[x] = src_uv[0];
1744    dst_u[x + 1] = src_uv[2];
1745    dst_v[x] = src_uv[1];
1746    dst_v[x + 1] = src_uv[3];
1747    src_uv += 4;
1748  }
1749  if (width & 1) {
1750    dst_u[width - 1] = src_uv[0];
1751    dst_v[width - 1] = src_uv[1];
1752  }
1753}
1754
1755void MergeUVRow_C(const uint8* src_u,
1756                  const uint8* src_v,
1757                  uint8* dst_uv,
1758                  int width) {
1759  int x;
1760  for (x = 0; x < width - 1; x += 2) {
1761    dst_uv[0] = src_u[x];
1762    dst_uv[1] = src_v[x];
1763    dst_uv[2] = src_u[x + 1];
1764    dst_uv[3] = src_v[x + 1];
1765    dst_uv += 4;
1766  }
1767  if (width & 1) {
1768    dst_uv[0] = src_u[width - 1];
1769    dst_uv[1] = src_v[width - 1];
1770  }
1771}
1772
1773void CopyRow_C(const uint8* src, uint8* dst, int count) {
1774  memcpy(dst, src, count);
1775}
1776
1777void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
1778  memcpy(dst, src, count * 2);
1779}
1780
1781void SetRow_C(uint8* dst, uint8 v8, int width) {
1782  memset(dst, v8, width);
1783}
1784
1785void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
1786  uint32* d = (uint32*)(dst_argb);
1787  int x;
1788  for (x = 0; x < width; ++x) {
1789    d[x] = v32;
1790  }
1791}
1792
1793// Filter 2 rows of YUY2 UV's (422) into U and V (420).
1794void YUY2ToUVRow_C(const uint8* src_yuy2,
1795                   int src_stride_yuy2,
1796                   uint8* dst_u,
1797                   uint8* dst_v,
1798                   int width) {
1799  // Output a row of UV values, filtering 2 rows of YUY2.
1800  int x;
1801  for (x = 0; x < width; x += 2) {
1802    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
1803    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
1804    src_yuy2 += 4;
1805    dst_u += 1;
1806    dst_v += 1;
1807  }
1808}
1809
1810// Copy row of YUY2 UV's (422) into U and V (422).
1811void YUY2ToUV422Row_C(const uint8* src_yuy2,
1812                      uint8* dst_u,
1813                      uint8* dst_v,
1814                      int width) {
1815  // Output a row of UV values.
1816  int x;
1817  for (x = 0; x < width; x += 2) {
1818    dst_u[0] = src_yuy2[1];
1819    dst_v[0] = src_yuy2[3];
1820    src_yuy2 += 4;
1821    dst_u += 1;
1822    dst_v += 1;
1823  }
1824}
1825
1826// Copy row of YUY2 Y's (422) into Y (420/422).
1827void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
1828  // Output a row of Y values.
1829  int x;
1830  for (x = 0; x < width - 1; x += 2) {
1831    dst_y[x] = src_yuy2[0];
1832    dst_y[x + 1] = src_yuy2[2];
1833    src_yuy2 += 4;
1834  }
1835  if (width & 1) {
1836    dst_y[width - 1] = src_yuy2[0];
1837  }
1838}
1839
1840// Filter 2 rows of UYVY UV's (422) into U and V (420).
1841void UYVYToUVRow_C(const uint8* src_uyvy,
1842                   int src_stride_uyvy,
1843                   uint8* dst_u,
1844                   uint8* dst_v,
1845                   int width) {
1846  // Output a row of UV values.
1847  int x;
1848  for (x = 0; x < width; x += 2) {
1849    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
1850    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
1851    src_uyvy += 4;
1852    dst_u += 1;
1853    dst_v += 1;
1854  }
1855}
1856
1857// Copy row of UYVY UV's (422) into U and V (422).
1858void UYVYToUV422Row_C(const uint8* src_uyvy,
1859                      uint8* dst_u,
1860                      uint8* dst_v,
1861                      int width) {
1862  // Output a row of UV values.
1863  int x;
1864  for (x = 0; x < width; x += 2) {
1865    dst_u[0] = src_uyvy[0];
1866    dst_v[0] = src_uyvy[2];
1867    src_uyvy += 4;
1868    dst_u += 1;
1869    dst_v += 1;
1870  }
1871}
1872
1873// Copy row of UYVY Y's (422) into Y (420/422).
1874void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
1875  // Output a row of Y values.
1876  int x;
1877  for (x = 0; x < width - 1; x += 2) {
1878    dst_y[x] = src_uyvy[1];
1879    dst_y[x + 1] = src_uyvy[3];
1880    src_uyvy += 4;
1881  }
1882  if (width & 1) {
1883    dst_y[width - 1] = src_uyvy[1];
1884  }
1885}
1886
1887#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
1888
1889// Blend src_argb0 over src_argb1 and store to dst_argb.
1890// dst_argb may be src_argb0 or src_argb1.
1891// This code mimics the SSSE3 version for better testability.
1892void ARGBBlendRow_C(const uint8* src_argb0,
1893                    const uint8* src_argb1,
1894                    uint8* dst_argb,
1895                    int width) {
1896  int x;
1897  for (x = 0; x < width - 1; x += 2) {
1898    uint32 fb = src_argb0[0];
1899    uint32 fg = src_argb0[1];
1900    uint32 fr = src_argb0[2];
1901    uint32 a = src_argb0[3];
1902    uint32 bb = src_argb1[0];
1903    uint32 bg = src_argb1[1];
1904    uint32 br = src_argb1[2];
1905    dst_argb[0] = BLEND(fb, bb, a);
1906    dst_argb[1] = BLEND(fg, bg, a);
1907    dst_argb[2] = BLEND(fr, br, a);
1908    dst_argb[3] = 255u;
1909
1910    fb = src_argb0[4 + 0];
1911    fg = src_argb0[4 + 1];
1912    fr = src_argb0[4 + 2];
1913    a = src_argb0[4 + 3];
1914    bb = src_argb1[4 + 0];
1915    bg = src_argb1[4 + 1];
1916    br = src_argb1[4 + 2];
1917    dst_argb[4 + 0] = BLEND(fb, bb, a);
1918    dst_argb[4 + 1] = BLEND(fg, bg, a);
1919    dst_argb[4 + 2] = BLEND(fr, br, a);
1920    dst_argb[4 + 3] = 255u;
1921    src_argb0 += 8;
1922    src_argb1 += 8;
1923    dst_argb += 8;
1924  }
1925
1926  if (width & 1) {
1927    uint32 fb = src_argb0[0];
1928    uint32 fg = src_argb0[1];
1929    uint32 fr = src_argb0[2];
1930    uint32 a = src_argb0[3];
1931    uint32 bb = src_argb1[0];
1932    uint32 bg = src_argb1[1];
1933    uint32 br = src_argb1[2];
1934    dst_argb[0] = BLEND(fb, bb, a);
1935    dst_argb[1] = BLEND(fg, bg, a);
1936    dst_argb[2] = BLEND(fr, br, a);
1937    dst_argb[3] = 255u;
1938  }
1939}
1940#undef BLEND
1941
1942#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
1943void BlendPlaneRow_C(const uint8* src0,
1944                     const uint8* src1,
1945                     const uint8* alpha,
1946                     uint8* dst,
1947                     int width) {
1948  int x;
1949  for (x = 0; x < width - 1; x += 2) {
1950    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
1951    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
1952    src0 += 2;
1953    src1 += 2;
1954    alpha += 2;
1955    dst += 2;
1956  }
1957  if (width & 1) {
1958    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
1959  }
1960}
1961#undef UBLEND
1962
1963#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
1964
1965// Multiply source RGB by alpha and store to destination.
1966// This code mimics the SSSE3 version for better testability.
1967void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1968  int i;
1969  for (i = 0; i < width - 1; i += 2) {
1970    uint32 b = src_argb[0];
1971    uint32 g = src_argb[1];
1972    uint32 r = src_argb[2];
1973    uint32 a = src_argb[3];
1974    dst_argb[0] = ATTENUATE(b, a);
1975    dst_argb[1] = ATTENUATE(g, a);
1976    dst_argb[2] = ATTENUATE(r, a);
1977    dst_argb[3] = a;
1978    b = src_argb[4];
1979    g = src_argb[5];
1980    r = src_argb[6];
1981    a = src_argb[7];
1982    dst_argb[4] = ATTENUATE(b, a);
1983    dst_argb[5] = ATTENUATE(g, a);
1984    dst_argb[6] = ATTENUATE(r, a);
1985    dst_argb[7] = a;
1986    src_argb += 8;
1987    dst_argb += 8;
1988  }
1989
1990  if (width & 1) {
1991    const uint32 b = src_argb[0];
1992    const uint32 g = src_argb[1];
1993    const uint32 r = src_argb[2];
1994    const uint32 a = src_argb[3];
1995    dst_argb[0] = ATTENUATE(b, a);
1996    dst_argb[1] = ATTENUATE(g, a);
1997    dst_argb[2] = ATTENUATE(r, a);
1998    dst_argb[3] = a;
1999  }
2000}
2001#undef ATTENUATE
2002
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values, e.g. 125.
// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
//
// T(a) builds one entry: 0x0100 (1.0 in 8.8) in the upper 16 bits and
// 0x10000 / a in the lower 16 bits. It is only expanded inside the
// initializer below and is undefined again right after the table.
#define T(a) 0x01000000 + (0x10000 / a)
// Indexed by alpha (0..255). Three entries are written out by hand instead of
// using T():
//   [0]   low half is 0, so no division by zero is needed;
//   [1]   low half is 0xffff, because 0x10000 does not fit in 16 bits;
//   [255] low half is 0x0100 rather than 0x10000 / 255 (0x0101).
const uint32 fixed_invtbl8[256] = {
    0x01000000, 0x0100ffff, T(0x02), T(0x03),   T(0x04), T(0x05), T(0x06),
    T(0x07),    T(0x08),    T(0x09), T(0x0a),   T(0x0b), T(0x0c), T(0x0d),
    T(0x0e),    T(0x0f),    T(0x10), T(0x11),   T(0x12), T(0x13), T(0x14),
    T(0x15),    T(0x16),    T(0x17), T(0x18),   T(0x19), T(0x1a), T(0x1b),
    T(0x1c),    T(0x1d),    T(0x1e), T(0x1f),   T(0x20), T(0x21), T(0x22),
    T(0x23),    T(0x24),    T(0x25), T(0x26),   T(0x27), T(0x28), T(0x29),
    T(0x2a),    T(0x2b),    T(0x2c), T(0x2d),   T(0x2e), T(0x2f), T(0x30),
    T(0x31),    T(0x32),    T(0x33), T(0x34),   T(0x35), T(0x36), T(0x37),
    T(0x38),    T(0x39),    T(0x3a), T(0x3b),   T(0x3c), T(0x3d), T(0x3e),
    T(0x3f),    T(0x40),    T(0x41), T(0x42),   T(0x43), T(0x44), T(0x45),
    T(0x46),    T(0x47),    T(0x48), T(0x49),   T(0x4a), T(0x4b), T(0x4c),
    T(0x4d),    T(0x4e),    T(0x4f), T(0x50),   T(0x51), T(0x52), T(0x53),
    T(0x54),    T(0x55),    T(0x56), T(0x57),   T(0x58), T(0x59), T(0x5a),
    T(0x5b),    T(0x5c),    T(0x5d), T(0x5e),   T(0x5f), T(0x60), T(0x61),
    T(0x62),    T(0x63),    T(0x64), T(0x65),   T(0x66), T(0x67), T(0x68),
    T(0x69),    T(0x6a),    T(0x6b), T(0x6c),   T(0x6d), T(0x6e), T(0x6f),
    T(0x70),    T(0x71),    T(0x72), T(0x73),   T(0x74), T(0x75), T(0x76),
    T(0x77),    T(0x78),    T(0x79), T(0x7a),   T(0x7b), T(0x7c), T(0x7d),
    T(0x7e),    T(0x7f),    T(0x80), T(0x81),   T(0x82), T(0x83), T(0x84),
    T(0x85),    T(0x86),    T(0x87), T(0x88),   T(0x89), T(0x8a), T(0x8b),
    T(0x8c),    T(0x8d),    T(0x8e), T(0x8f),   T(0x90), T(0x91), T(0x92),
    T(0x93),    T(0x94),    T(0x95), T(0x96),   T(0x97), T(0x98), T(0x99),
    T(0x9a),    T(0x9b),    T(0x9c), T(0x9d),   T(0x9e), T(0x9f), T(0xa0),
    T(0xa1),    T(0xa2),    T(0xa3), T(0xa4),   T(0xa5), T(0xa6), T(0xa7),
    T(0xa8),    T(0xa9),    T(0xaa), T(0xab),   T(0xac), T(0xad), T(0xae),
    T(0xaf),    T(0xb0),    T(0xb1), T(0xb2),   T(0xb3), T(0xb4), T(0xb5),
    T(0xb6),    T(0xb7),    T(0xb8), T(0xb9),   T(0xba), T(0xbb), T(0xbc),
    T(0xbd),    T(0xbe),    T(0xbf), T(0xc0),   T(0xc1), T(0xc2), T(0xc3),
    T(0xc4),    T(0xc5),    T(0xc6), T(0xc7),   T(0xc8), T(0xc9), T(0xca),
    T(0xcb),    T(0xcc),    T(0xcd), T(0xce),   T(0xcf), T(0xd0), T(0xd1),
    T(0xd2),    T(0xd3),    T(0xd4), T(0xd5),   T(0xd6), T(0xd7), T(0xd8),
    T(0xd9),    T(0xda),    T(0xdb), T(0xdc),   T(0xdd), T(0xde), T(0xdf),
    T(0xe0),    T(0xe1),    T(0xe2), T(0xe3),   T(0xe4), T(0xe5), T(0xe6),
    T(0xe7),    T(0xe8),    T(0xe9), T(0xea),   T(0xeb), T(0xec), T(0xed),
    T(0xee),    T(0xef),    T(0xf0), T(0xf1),   T(0xf2), T(0xf3), T(0xf4),
    T(0xf5),    T(0xf6),    T(0xf7), T(0xf8),   T(0xf9), T(0xfa), T(0xfb),
    T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};
#undef T
2049
2050void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
2051  int i;
2052  for (i = 0; i < width; ++i) {
2053    uint32 b = src_argb[0];
2054    uint32 g = src_argb[1];
2055    uint32 r = src_argb[2];
2056    const uint32 a = src_argb[3];
2057    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
2058    b = (b * ia) >> 8;
2059    g = (g * ia) >> 8;
2060    r = (r * ia) >> 8;
2061    // Clamping should not be necessary but is free in assembly.
2062    dst_argb[0] = clamp255(b);
2063    dst_argb[1] = clamp255(g);
2064    dst_argb[2] = clamp255(r);
2065    dst_argb[3] = a;
2066    src_argb += 4;
2067    dst_argb += 4;
2068  }
2069}
2070
2071void ComputeCumulativeSumRow_C(const uint8* row,
2072                               int32* cumsum,
2073                               const int32* previous_cumsum,
2074                               int width) {
2075  int32 row_sum[4] = {0, 0, 0, 0};
2076  int x;
2077  for (x = 0; x < width; ++x) {
2078    row_sum[0] += row[x * 4 + 0];
2079    row_sum[1] += row[x * 4 + 1];
2080    row_sum[2] += row[x * 4 + 2];
2081    row_sum[3] += row[x * 4 + 3];
2082    cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
2083    cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
2084    cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
2085    cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
2086  }
2087}
2088
2089void CumulativeSumToAverageRow_C(const int32* tl,
2090                                 const int32* bl,
2091                                 int w,
2092                                 int area,
2093                                 uint8* dst,
2094                                 int count) {
2095  float ooa = 1.0f / area;
2096  int i;
2097  for (i = 0; i < count; ++i) {
2098    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
2099    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
2100    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
2101    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
2102    dst += 4;
2103    tl += 4;
2104    bl += 4;
2105  }
2106}
2107
2108// Copy pixels from rotated source to destination row with a slope.
2109LIBYUV_API
2110void ARGBAffineRow_C(const uint8* src_argb,
2111                     int src_argb_stride,
2112                     uint8* dst_argb,
2113                     const float* uv_dudv,
2114                     int width) {
2115  int i;
2116  // Render a row of pixels from source into a buffer.
2117  float uv[2];
2118  uv[0] = uv_dudv[0];
2119  uv[1] = uv_dudv[1];
2120  for (i = 0; i < width; ++i) {
2121    int x = (int)(uv[0]);
2122    int y = (int)(uv[1]);
2123    *(uint32*)(dst_argb) =
2124        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
2125    dst_argb += 4;
2126    uv[0] += uv_dudv[2];
2127    uv[1] += uv_dudv[3];
2128  }
2129}
2130
2131// Blend 2 rows into 1.
2132static void HalfRow_C(const uint8* src_uv,
2133                      ptrdiff_t src_uv_stride,
2134                      uint8* dst_uv,
2135                      int width) {
2136  int x;
2137  for (x = 0; x < width; ++x) {
2138    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
2139  }
2140}
2141
2142static void HalfRow_16_C(const uint16* src_uv,
2143                         ptrdiff_t src_uv_stride,
2144                         uint16* dst_uv,
2145                         int width) {
2146  int x;
2147  for (x = 0; x < width; ++x) {
2148    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
2149  }
2150}
2151
2152// C version 2x2 -> 2x1.
2153void InterpolateRow_C(uint8* dst_ptr,
2154                      const uint8* src_ptr,
2155                      ptrdiff_t src_stride,
2156                      int width,
2157                      int source_y_fraction) {
2158  int y1_fraction = source_y_fraction;
2159  int y0_fraction = 256 - y1_fraction;
2160  const uint8* src_ptr1 = src_ptr + src_stride;
2161  int x;
2162  if (y1_fraction == 0) {
2163    memcpy(dst_ptr, src_ptr, width);
2164    return;
2165  }
2166  if (y1_fraction == 128) {
2167    HalfRow_C(src_ptr, src_stride, dst_ptr, width);
2168    return;
2169  }
2170  for (x = 0; x < width - 1; x += 2) {
2171    dst_ptr[0] =
2172        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
2173    dst_ptr[1] =
2174        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
2175    src_ptr += 2;
2176    src_ptr1 += 2;
2177    dst_ptr += 2;
2178  }
2179  if (width & 1) {
2180    dst_ptr[0] =
2181        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
2182  }
2183}
2184
2185void InterpolateRow_16_C(uint16* dst_ptr,
2186                         const uint16* src_ptr,
2187                         ptrdiff_t src_stride,
2188                         int width,
2189                         int source_y_fraction) {
2190  int y1_fraction = source_y_fraction;
2191  int y0_fraction = 256 - y1_fraction;
2192  const uint16* src_ptr1 = src_ptr + src_stride;
2193  int x;
2194  if (source_y_fraction == 0) {
2195    memcpy(dst_ptr, src_ptr, width * 2);
2196    return;
2197  }
2198  if (source_y_fraction == 128) {
2199    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
2200    return;
2201  }
2202  for (x = 0; x < width - 1; x += 2) {
2203    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2204    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
2205    src_ptr += 2;
2206    src_ptr1 += 2;
2207    dst_ptr += 2;
2208  }
2209  if (width & 1) {
2210    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2211  }
2212}
2213
2214// Use first 4 shuffler values to reorder ARGB channels.
2215void ARGBShuffleRow_C(const uint8* src_argb,
2216                      uint8* dst_argb,
2217                      const uint8* shuffler,
2218                      int width) {
2219  int index0 = shuffler[0];
2220  int index1 = shuffler[1];
2221  int index2 = shuffler[2];
2222  int index3 = shuffler[3];
2223  // Shuffle a row of ARGB.
2224  int x;
2225  for (x = 0; x < width; ++x) {
2226    // To support in-place conversion.
2227    uint8 b = src_argb[index0];
2228    uint8 g = src_argb[index1];
2229    uint8 r = src_argb[index2];
2230    uint8 a = src_argb[index3];
2231    dst_argb[0] = b;
2232    dst_argb[1] = g;
2233    dst_argb[2] = r;
2234    dst_argb[3] = a;
2235    src_argb += 4;
2236    dst_argb += 4;
2237  }
2238}
2239
2240void I422ToYUY2Row_C(const uint8* src_y,
2241                     const uint8* src_u,
2242                     const uint8* src_v,
2243                     uint8* dst_frame,
2244                     int width) {
2245  int x;
2246  for (x = 0; x < width - 1; x += 2) {
2247    dst_frame[0] = src_y[0];
2248    dst_frame[1] = src_u[0];
2249    dst_frame[2] = src_y[1];
2250    dst_frame[3] = src_v[0];
2251    dst_frame += 4;
2252    src_y += 2;
2253    src_u += 1;
2254    src_v += 1;
2255  }
2256  if (width & 1) {
2257    dst_frame[0] = src_y[0];
2258    dst_frame[1] = src_u[0];
2259    dst_frame[2] = 0;
2260    dst_frame[3] = src_v[0];
2261  }
2262}
2263
2264void I422ToUYVYRow_C(const uint8* src_y,
2265                     const uint8* src_u,
2266                     const uint8* src_v,
2267                     uint8* dst_frame,
2268                     int width) {
2269  int x;
2270  for (x = 0; x < width - 1; x += 2) {
2271    dst_frame[0] = src_u[0];
2272    dst_frame[1] = src_y[0];
2273    dst_frame[2] = src_v[0];
2274    dst_frame[3] = src_y[1];
2275    dst_frame += 4;
2276    src_y += 2;
2277    src_u += 1;
2278    src_v += 1;
2279  }
2280  if (width & 1) {
2281    dst_frame[0] = src_u[0];
2282    dst_frame[1] = src_y[0];
2283    dst_frame[2] = src_v[0];
2284    dst_frame[3] = 0;
2285  }
2286}
2287
2288void ARGBPolynomialRow_C(const uint8* src_argb,
2289                         uint8* dst_argb,
2290                         const float* poly,
2291                         int width) {
2292  int i;
2293  for (i = 0; i < width; ++i) {
2294    float b = (float)(src_argb[0]);
2295    float g = (float)(src_argb[1]);
2296    float r = (float)(src_argb[2]);
2297    float a = (float)(src_argb[3]);
2298    float b2 = b * b;
2299    float g2 = g * g;
2300    float r2 = r * r;
2301    float a2 = a * a;
2302    float db = poly[0] + poly[4] * b;
2303    float dg = poly[1] + poly[5] * g;
2304    float dr = poly[2] + poly[6] * r;
2305    float da = poly[3] + poly[7] * a;
2306    float b3 = b2 * b;
2307    float g3 = g2 * g;
2308    float r3 = r2 * r;
2309    float a3 = a2 * a;
2310    db += poly[8] * b2;
2311    dg += poly[9] * g2;
2312    dr += poly[10] * r2;
2313    da += poly[11] * a2;
2314    db += poly[12] * b3;
2315    dg += poly[13] * g3;
2316    dr += poly[14] * r3;
2317    da += poly[15] * a3;
2318
2319    dst_argb[0] = Clamp((int32)(db));
2320    dst_argb[1] = Clamp((int32)(dg));
2321    dst_argb[2] = Clamp((int32)(dr));
2322    dst_argb[3] = Clamp((int32)(da));
2323    src_argb += 4;
2324    dst_argb += 4;
2325  }
2326}
2327
2328// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
2329// adjust the source integer range to the half float range desired.
2330
2331// This magic constant is 2^-112. Multiplying by this
2332// is the same as subtracting 112 from the exponent, which
2333// is the difference in exponent bias between 32-bit and
2334// 16-bit floats. Once we've done this subtraction, we can
2335// simply extract the low bits of the exponent and the high
2336// bits of the mantissa from our float and we're done.
2337
2338void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
2339  int i;
2340  float mult = 1.9259299444e-34f * scale;
2341  for (i = 0; i < width; ++i) {
2342    float value = src[i] * mult;
2343    dst[i] = (uint16)((*(uint32_t*)&value) >> 13);
2344  }
2345}
2346
2347void ARGBLumaColorTableRow_C(const uint8* src_argb,
2348                             uint8* dst_argb,
2349                             int width,
2350                             const uint8* luma,
2351                             uint32 lumacoeff) {
2352  uint32 bc = lumacoeff & 0xff;
2353  uint32 gc = (lumacoeff >> 8) & 0xff;
2354  uint32 rc = (lumacoeff >> 16) & 0xff;
2355
2356  int i;
2357  for (i = 0; i < width - 1; i += 2) {
2358    // Luminance in rows, color values in columns.
2359    const uint8* luma0 =
2360        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
2361        luma;
2362    const uint8* luma1;
2363    dst_argb[0] = luma0[src_argb[0]];
2364    dst_argb[1] = luma0[src_argb[1]];
2365    dst_argb[2] = luma0[src_argb[2]];
2366    dst_argb[3] = src_argb[3];
2367    luma1 =
2368        ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) +
2369        luma;
2370    dst_argb[4] = luma1[src_argb[4]];
2371    dst_argb[5] = luma1[src_argb[5]];
2372    dst_argb[6] = luma1[src_argb[6]];
2373    dst_argb[7] = src_argb[7];
2374    src_argb += 8;
2375    dst_argb += 8;
2376  }
2377  if (width & 1) {
2378    // Luminance in rows, color values in columns.
2379    const uint8* luma0 =
2380        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
2381        luma;
2382    dst_argb[0] = luma0[src_argb[0]];
2383    dst_argb[1] = luma0[src_argb[1]];
2384    dst_argb[2] = luma0[src_argb[2]];
2385    dst_argb[3] = src_argb[3];
2386  }
2387}
2388
2389void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
2390  int i;
2391  for (i = 0; i < width - 1; i += 2) {
2392    dst[3] = src[3];
2393    dst[7] = src[7];
2394    dst += 8;
2395    src += 8;
2396  }
2397  if (width & 1) {
2398    dst[3] = src[3];
2399  }
2400}
2401
2402void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
2403  int i;
2404  for (i = 0; i < width - 1; i += 2) {
2405    dst_a[0] = src_argb[3];
2406    dst_a[1] = src_argb[7];
2407    dst_a += 2;
2408    src_argb += 8;
2409  }
2410  if (width & 1) {
2411    dst_a[0] = src_argb[3];
2412  }
2413}
2414
2415void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
2416  int i;
2417  for (i = 0; i < width - 1; i += 2) {
2418    dst[3] = src[0];
2419    dst[7] = src[1];
2420    dst += 8;
2421    src += 2;
2422  }
2423  if (width & 1) {
2424    dst[3] = src[0];
2425  }
2426}
2427
// Maximum temporary width for wrappers to process at a time, in pixels.
// The two-step wrappers below size their intermediate row buffers as
// MAXTWIDTH * 4 bytes and convert longer rows in chunks of this many pixels.
#define MAXTWIDTH 2048
2430
#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
    defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
// Step 1 runs I422ToARGBRow_SSSE3 into a temporary row; step 2 runs
// ARGBToRGB565Row_SSE2 from that row into dst_rgb565. At most MAXTWIDTH pixels
// are handled per pass.
void I422ToRGB565Row_SSSE3(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, chunk);
    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;
  }
}
#endif
2453
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
// Two step wrapper: I422ToARGBRow_SSSE3 into a temporary row, then
// ARGBToARGB1555Row_SSE2 from that row into dst_argb1555, at most MAXTWIDTH
// pixels per pass.
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, chunk);
    ARGBToARGB1555Row_SSE2(row, dst_argb1555, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;
  }
}
#endif
2475
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
// Two step wrapper: I422ToARGBRow_SSSE3 into a temporary row, then
// ARGBToARGB4444Row_SSE2 from that row into dst_argb4444, at most MAXTWIDTH
// pixels per pass.
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, chunk);
    ARGBToARGB4444Row_SSE2(row, dst_argb4444, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;
  }
}
#endif
2497
#if defined(HAS_NV12TORGB565ROW_SSSE3)
// Two step wrapper: NV12ToARGBRow_SSSE3 into a temporary row, then
// ARGBToRGB565Row_SSE2 from that row into dst_rgb565, at most MAXTWIDTH pixels
// per pass.
void NV12ToRGB565Row_SSSE3(const uint8* src_y,
                           const uint8* src_uv,
                           uint8* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, chunk);
    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
    src_y += chunk;
    src_uv += chunk;
    dst_rgb565 += chunk * 2;
  }
}
#endif
2517
#if defined(HAS_I422TORGB565ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then an
// ARGBToRGB565 row function (AVX2 when available, otherwise SSE2) from that
// row into dst_rgb565, at most MAXTWIDTH pixels per pass.
void I422ToRGB565Row_AVX2(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(row, dst_rgb565, chunk);
#else
    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;
  }
}
#endif
2542
#if defined(HAS_I422TOARGB1555ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then an
// ARGBToARGB1555 row function (AVX2 when available, otherwise SSE2) from that
// row into dst_argb1555, at most MAXTWIDTH pixels per pass.
void I422ToARGB1555Row_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
    ARGBToARGB1555Row_AVX2(row, dst_argb1555, chunk);
#else
    ARGBToARGB1555Row_SSE2(row, dst_argb1555, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;
  }
}
#endif
2568
#if defined(HAS_I422TOARGB4444ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then an
// ARGBToARGB4444 row function (AVX2 when available, otherwise SSE2) from that
// row into dst_argb4444, at most MAXTWIDTH pixels per pass.
void I422ToARGB4444Row_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
    ARGBToARGB4444Row_AVX2(row, dst_argb4444, chunk);
#else
    ARGBToARGB4444Row_SSE2(row, dst_argb4444, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;
  }
}
#endif
2594
#if defined(HAS_I422TORGB24ROW_AVX2)
// Two step wrapper: I422ToARGBRow_AVX2 into a temporary row, then
// ARGBToRGB24Row_SSSE3 from that row into dst_rgb24 (3 bytes per pixel), at
// most MAXTWIDTH pixels per pass.
void I422ToRGB24Row_AVX2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
    // TODO(fbarchard): ARGBToRGB24Row_AVX2
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, chunk);
    src_y += chunk;
    src_u += chunk / 2;
    src_v += chunk / 2;
    dst_rgb24 += chunk * 3;
  }
}
#endif
2617
#if defined(HAS_NV12TORGB565ROW_AVX2)
// Two step wrapper: NV12ToARGBRow_AVX2 into a temporary row, then an
// ARGBToRGB565 row function (AVX2 when available, otherwise SSE2) from that
// row into dst_rgb565, at most MAXTWIDTH pixels per pass.
void NV12ToRGB565Row_AVX2(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
  int remaining;
  int chunk;
  for (remaining = width; remaining > 0; remaining -= chunk) {
    chunk = (remaining < MAXTWIDTH) ? remaining : MAXTWIDTH;
    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(row, dst_rgb565, chunk);
#else
    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
#endif
    src_y += chunk;
    src_uv += chunk;
    dst_rgb565 += chunk * 2;
  }
}
#endif
2641
2642#ifdef __cplusplus
2643}  // extern "C"
2644}  // namespace libyuv
2645#endif
2646