1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#include <string.h>  // For memcpy
14
15#include "libyuv/basic_types.h"
16
17#ifdef __cplusplus
18namespace libyuv {
19extern "C" {
20#endif
21
22void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
23  for (int x = 0; x < width; ++x) {
24    // To support in-place conversion.
25    uint8 a = src_bgra[0];
26    uint8 r = src_bgra[1];
27    uint8 g = src_bgra[2];
28    uint8 b = src_bgra[3];
29    dst_argb[0] = b;
30    dst_argb[1] = g;
31    dst_argb[2] = r;
32    dst_argb[3] = a;
33    dst_argb += 4;
34    src_bgra += 4;
35  }
36}
37
38void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
39  for (int x = 0; x < width; ++x) {
40    // To support in-place conversion.
41    uint8 r = src_abgr[0];
42    uint8 g = src_abgr[1];
43    uint8 b = src_abgr[2];
44    uint8 a = src_abgr[3];
45    dst_argb[0] = b;
46    dst_argb[1] = g;
47    dst_argb[2] = r;
48    dst_argb[3] = a;
49    dst_argb += 4;
50    src_abgr += 4;
51  }
52}
53
54void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
55  for (int x = 0; x < width; ++x) {
56    // To support in-place conversion.
57    uint8 a = src_abgr[0];
58    uint8 b = src_abgr[1];
59    uint8 g = src_abgr[2];
60    uint8 r = src_abgr[3];
61    dst_argb[0] = b;
62    dst_argb[1] = g;
63    dst_argb[2] = r;
64    dst_argb[3] = a;
65    dst_argb += 4;
66    src_abgr += 4;
67  }
68}
69
70void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
71  for (int x = 0; x < width; ++x) {
72    uint8 b = src_rgb24[0];
73    uint8 g = src_rgb24[1];
74    uint8 r = src_rgb24[2];
75    dst_argb[0] = b;
76    dst_argb[1] = g;
77    dst_argb[2] = r;
78    dst_argb[3] = 255u;
79    dst_argb += 4;
80    src_rgb24 += 3;
81  }
82}
83
84void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
85  for (int x = 0; x < width; ++x) {
86    uint8 r = src_raw[0];
87    uint8 g = src_raw[1];
88    uint8 b = src_raw[2];
89    dst_argb[0] = b;
90    dst_argb[1] = g;
91    dst_argb[2] = r;
92    dst_argb[3] = 255u;
93    dst_argb += 4;
94    src_raw += 3;
95  }
96}
97
98void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
99  for (int x = 0; x < width; ++x) {
100    uint8 b = src_rgb[0] & 0x1f;
101    uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
102    uint8 r = src_rgb[1] >> 3;
103    dst_argb[0] = (b << 3) | (b >> 2);
104    dst_argb[1] = (g << 2) | (g >> 4);
105    dst_argb[2] = (r << 3) | (r >> 2);
106    dst_argb[3] = 255u;
107    dst_argb += 4;
108    src_rgb += 2;
109  }
110}
111
112void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
113  for (int x = 0; x < width; ++x) {
114    uint8 b = src_rgb[0] & 0x1f;
115    uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
116    uint8 r = (src_rgb[1] & 0x7c) >> 2;
117    uint8 a = src_rgb[1] >> 7;
118    dst_argb[0] = (b << 3) | (b >> 2);
119    dst_argb[1] = (g << 3) | (g >> 2);
120    dst_argb[2] = (r << 3) | (r >> 2);
121    dst_argb[3] = -a;
122    dst_argb += 4;
123    src_rgb += 2;
124  }
125}
126
127void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
128  for (int x = 0; x < width; ++x) {
129    uint8 b = src_rgb[0] & 0x0f;
130    uint8 g = src_rgb[0] >> 4;
131    uint8 r = src_rgb[1] & 0x0f;
132    uint8 a = src_rgb[1] >> 4;
133    dst_argb[0] = (b << 4) | b;
134    dst_argb[1] = (g << 4) | g;
135    dst_argb[2] = (r << 4) | r;
136    dst_argb[3] = (a << 4) | a;
137    dst_argb += 4;
138    src_rgb += 2;
139  }
140}
141
142void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
143  for (int x = 0; x < width; ++x) {
144    uint8 b = src_argb[0];
145    uint8 g = src_argb[1];
146    uint8 r = src_argb[2];
147    uint8 a = src_argb[3];
148    dst_rgb[0] = a;
149    dst_rgb[1] = b;
150    dst_rgb[2] = g;
151    dst_rgb[3] = r;
152    dst_rgb += 4;
153    src_argb += 4;
154  }
155}
156
157void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
158  for (int x = 0; x < width; ++x) {
159    uint8 b = src_argb[0];
160    uint8 g = src_argb[1];
161    uint8 r = src_argb[2];
162    dst_rgb[0] = b;
163    dst_rgb[1] = g;
164    dst_rgb[2] = r;
165    dst_rgb += 3;
166    src_argb += 4;
167  }
168}
169
170void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
171  for (int x = 0; x < width; ++x) {
172    uint8 b = src_argb[0];
173    uint8 g = src_argb[1];
174    uint8 r = src_argb[2];
175    dst_rgb[0] = r;
176    dst_rgb[1] = g;
177    dst_rgb[2] = b;
178    dst_rgb += 3;
179    src_argb += 4;
180  }
181}
182
183// TODO(fbarchard): support big endian CPU
184void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
185  for (int x = 0; x < width - 1; x += 2) {
186    uint8 b0 = src_argb[0] >> 3;
187    uint8 g0 = src_argb[1] >> 2;
188    uint8 r0 = src_argb[2] >> 3;
189    uint8 b1 = src_argb[4] >> 3;
190    uint8 g1 = src_argb[5] >> 2;
191    uint8 r1 = src_argb[6] >> 3;
192    *reinterpret_cast<uint32*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11) |
193        (b1 << 16) | (g1 << 21) | (r1 << 27);
194    dst_rgb += 4;
195    src_argb += 8;
196  }
197  if (width & 1) {
198    uint8 b0 = src_argb[0] >> 3;
199    uint8 g0 = src_argb[1] >> 2;
200    uint8 r0 = src_argb[2] >> 3;
201    *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
202  }
203}
204
205void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
206  for (int x = 0; x < width - 1; x += 2) {
207    uint8 b0 = src_argb[0] >> 3;
208    uint8 g0 = src_argb[1] >> 3;
209    uint8 r0 = src_argb[2] >> 3;
210    uint8 a0 = src_argb[3] >> 7;
211    uint8 b1 = src_argb[4] >> 3;
212    uint8 g1 = src_argb[5] >> 3;
213    uint8 r1 = src_argb[6] >> 3;
214    uint8 a1 = src_argb[7] >> 7;
215    *reinterpret_cast<uint32*>(dst_rgb) =
216        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
217        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
218    dst_rgb += 4;
219    src_argb += 8;
220  }
221  if (width & 1) {
222    uint8 b0 = src_argb[0] >> 3;
223    uint8 g0 = src_argb[1] >> 3;
224    uint8 r0 = src_argb[2] >> 3;
225    uint8 a0 = src_argb[3] >> 7;
226    *reinterpret_cast<uint16*>(dst_rgb) =
227        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
228  }
229}
230
231void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
232  for (int x = 0; x < width - 1; x += 2) {
233    uint8 b0 = src_argb[0] >> 4;
234    uint8 g0 = src_argb[1] >> 4;
235    uint8 r0 = src_argb[2] >> 4;
236    uint8 a0 = src_argb[3] >> 4;
237    uint8 b1 = src_argb[4] >> 4;
238    uint8 g1 = src_argb[5] >> 4;
239    uint8 r1 = src_argb[6] >> 4;
240    uint8 a1 = src_argb[7] >> 4;
241    *reinterpret_cast<uint32*>(dst_rgb) =
242        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
243        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
244    dst_rgb += 4;
245    src_argb += 8;
246  }
247  if (width & 1) {
248    uint8 b0 = src_argb[0] >> 4;
249    uint8 g0 = src_argb[1] >> 4;
250    uint8 r0 = src_argb[2] >> 4;
251    uint8 a0 = src_argb[3] >> 4;
252    *reinterpret_cast<uint16*>(dst_rgb) =
253        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
254  }
255}
256
257static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
258  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
259}
260
261static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
262  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
263}
264static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
265  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
266}
267
268#define MAKEROWY(NAME, R, G, B) \
269void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
270  for (int x = 0; x < width; ++x) {                                            \
271    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
272    src_argb0 += 4;                                                            \
273    dst_y += 1;                                                                \
274  }                                                                            \
275}                                                                              \
276void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
277                       uint8* dst_u, uint8* dst_v, int width) {                \
278  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
279  for (int x = 0; x < width - 1; x += 2) {                                     \
280    uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] +                                \
281               src_rgb1[B] + src_rgb1[B + 4]) >> 2;                            \
282    uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] +                                \
283               src_rgb1[G] + src_rgb1[G + 4]) >> 2;                            \
284    uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] +                                \
285               src_rgb1[R] + src_rgb1[R + 4]) >> 2;                            \
286    dst_u[0] = RGBToU(ar, ag, ab);                                             \
287    dst_v[0] = RGBToV(ar, ag, ab);                                             \
288    src_rgb0 += 8;                                                             \
289    src_rgb1 += 8;                                                             \
290    dst_u += 1;                                                                \
291    dst_v += 1;                                                                \
292  }                                                                            \
293  if (width & 1) {                                                             \
294    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
295    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
296    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
297    dst_u[0] = RGBToU(ar, ag, ab);                                             \
298    dst_v[0] = RGBToV(ar, ag, ab);                                             \
299  }                                                                            \
300}
301
302MAKEROWY(ARGB, 2, 1, 0)
303MAKEROWY(BGRA, 1, 2, 3)
304MAKEROWY(ABGR, 0, 1, 2)
305MAKEROWY(RGBA, 3, 2, 1)
306
307// http://en.wikipedia.org/wiki/Grayscale.
308// 0.11 * B + 0.59 * G + 0.30 * R
309// Coefficients rounded to multiple of 2 for consistency with SSSE3 version.
310static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) {
311  return (( 76 * r + 152 * g +  28 * b) >> 8);
312}
313
314void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
315  for (int x = 0; x < width; ++x) {
316    uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]);
317    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
318    dst_argb[3] = src_argb[3];
319    dst_argb += 4;
320    src_argb += 4;
321  }
322}
323
324// Convert a row of image to Sepia tone.
325void ARGBSepiaRow_C(uint8* dst_argb, int width) {
326  for (int x = 0; x < width; ++x) {
327    int b = dst_argb[0];
328    int g = dst_argb[1];
329    int r = dst_argb[2];
330    int sb = (b * 17 + g * 68 + r * 35) >> 7;
331    int sg = (b * 22 + g * 88 + r * 45) >> 7;
332    int sr = (b * 24 + g * 98 + r * 50) >> 7;
333    // b does not over flow. a is preserved from original.
334    if (sg > 255) {
335      sg = 255;
336    }
337    if (sr > 255) {
338      sr = 255;
339    }
340    dst_argb[0] = sb;
341    dst_argb[1] = sg;
342    dst_argb[2] = sr;
343    dst_argb += 4;
344  }
345}
346
347// Apply color matrix to a row of image. Matrix is signed.
348void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
349  for (int x = 0; x < width; ++x) {
350    int b = dst_argb[0];
351    int g = dst_argb[1];
352    int r = dst_argb[2];
353    int a = dst_argb[3];
354    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
355              r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
356    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
357              r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
358    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
359              r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
360    if (sb < 0) {
361      sb = 0;
362    }
363    if (sb > 255) {
364      sb = 255;
365    }
366    if (sg < 0) {
367      sg = 0;
368    }
369    if (sg > 255) {
370      sg = 255;
371    }
372    if (sr < 0) {
373      sr = 0;
374    }
375    if (sr > 255) {
376      sr = 255;
377    }
378    dst_argb[0] = sb;
379    dst_argb[1] = sg;
380    dst_argb[2] = sr;
381    dst_argb += 4;
382  }
383}
384
385// Apply color table to a row of image.
386void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
387  for (int x = 0; x < width; ++x) {
388    int b = dst_argb[0];
389    int g = dst_argb[1];
390    int r = dst_argb[2];
391    int a = dst_argb[3];
392    dst_argb[0] = table_argb[b * 4 + 0];
393    dst_argb[1] = table_argb[g * 4 + 1];
394    dst_argb[2] = table_argb[r * 4 + 2];
395    dst_argb[3] = table_argb[a * 4 + 3];
396    dst_argb += 4;
397  }
398}
399
400void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
401                       int interval_offset, int width) {
402  for (int x = 0; x < width; ++x) {
403    int b = dst_argb[0];
404    int g = dst_argb[1];
405    int r = dst_argb[2];
406    dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
407    dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
408    dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
409    dst_argb += 4;
410  }
411}
412
413void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
414  // Copy a Y to RGB.
415  for (int x = 0; x < width; ++x) {
416    uint8 y = src_y[0];
417    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
418    dst_argb[3] = 255u;
419    dst_argb += 4;
420    ++src_y;
421  }
422}
423
424// C reference code that mimics the YUV assembly.
425
// YUV to RGB coefficients with 6 fractional bits (scaled by 64).
// Negative values and compound expressions are parenthesized so the macros
// expand safely inside any larger expression.
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#define UB 127 /* 2.018 * 64 = 129, limited to the int8 maximum of 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V contributions at the neutral chroma value 128. It is
// subtracted so that U = V = 128 adds nothing to the channel.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)
440
441static __inline uint32 Clip(int32 val) {
442  if (val < 0) {
443    return static_cast<uint32>(0);
444  } else if (val > 255) {
445    return static_cast<uint32>(255);
446  }
447  return static_cast<uint32>(val);
448}
449
450static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
451                              int ashift, int rshift, int gshift, int bshift) {
452  int32 y1 = (static_cast<int32>(y) - 16) * YG;
453  uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
454  uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
455  uint32 r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
456  *reinterpret_cast<uint32*>(rgb_buf) = (b << bshift) |
457                                        (g << gshift) |
458                                        (r << rshift) |
459                                        (255u << ashift);
460}
461
462static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
463                               uint8* b, uint8* g, uint8* r) {
464  int32 y1 = (static_cast<int32>(y) - 16) * YG;
465  *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
466  *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
467  *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
468}
469
470void I444ToARGBRow_C(const uint8* y_buf,
471                     const uint8* u_buf,
472                     const uint8* v_buf,
473                     uint8* rgb_buf,
474                     int width) {
475  for (int x = 0; x < width; ++x) {
476    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
477    y_buf += 1;
478    u_buf += 1;
479    v_buf += 1;
480    rgb_buf += 4;  // Advance 1 pixel.
481  }
482}
483
484// Also used for 420
485void I422ToARGBRow_C(const uint8* y_buf,
486                     const uint8* u_buf,
487                     const uint8* v_buf,
488                     uint8* rgb_buf,
489                     int width) {
490  for (int x = 0; x < width - 1; x += 2) {
491    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
492    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
493    y_buf += 2;
494    u_buf += 1;
495    v_buf += 1;
496    rgb_buf += 8;  // Advance 2 pixels.
497  }
498  if (width & 1) {
499    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
500  }
501}
502
503void I422ToRGB24Row_C(const uint8* y_buf,
504                      const uint8* u_buf,
505                      const uint8* v_buf,
506                      uint8* rgb_buf,
507                      int width) {
508  for (int x = 0; x < width - 1; x += 2) {
509    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
510              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
511    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
512              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
513    y_buf += 2;
514    u_buf += 1;
515    v_buf += 1;
516    rgb_buf += 6;  // Advance 2 pixels.
517  }
518  if (width & 1) {
519    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
520              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
521  }
522}
523
524void I422ToRAWRow_C(const uint8* y_buf,
525                    const uint8* u_buf,
526                    const uint8* v_buf,
527                    uint8* rgb_buf,
528                    int width) {
529  for (int x = 0; x < width - 1; x += 2) {
530    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
531              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
532    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
533              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
534    y_buf += 2;
535    u_buf += 1;
536    v_buf += 1;
537    rgb_buf += 6;  // Advance 2 pixels.
538  }
539  if (width & 1) {
540    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
541              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
542  }
543}
544
545void I411ToARGBRow_C(const uint8* y_buf,
546                     const uint8* u_buf,
547                     const uint8* v_buf,
548                     uint8* rgb_buf,
549                     int width) {
550  for (int x = 0; x < width - 3; x += 4) {
551    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
552    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
553    YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
554    YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
555    y_buf += 4;
556    u_buf += 1;
557    v_buf += 1;
558    rgb_buf += 16;  // Advance 4 pixels.
559  }
560  if (width & 2) {
561    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
562    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
563    y_buf += 2;
564    rgb_buf += 8;  // Advance 2 pixels.
565  }
566  if (width & 1) {
567    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
568  }
569}
570
571void NV12ToARGBRow_C(const uint8* y_buf,
572                     const uint8* uv_buf,
573                     uint8* rgb_buf,
574                     int width) {
575  for (int x = 0; x < width - 1; x += 2) {
576    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
577    YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
578    y_buf += 2;
579    uv_buf += 2;
580    rgb_buf += 8;  // Advance 2 pixels.
581  }
582  if (width & 1) {
583    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
584  }
585}
586
587void NV21ToARGBRow_C(const uint8* y_buf,
588                     const uint8* vu_buf,
589                     uint8* rgb_buf,
590                     int width) {
591  for (int x = 0; x < width - 1; x += 2) {
592    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
593    YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
594    y_buf += 2;
595    vu_buf += 2;
596    rgb_buf += 8;  // Advance 2 pixels.
597  }
598  if (width & 1) {
599    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
600  }
601}
602
603void I422ToBGRARow_C(const uint8* y_buf,
604                     const uint8* u_buf,
605                     const uint8* v_buf,
606                     uint8* rgb_buf,
607                     int width) {
608  for (int x = 0; x < width - 1; x += 2) {
609    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
610    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
611    y_buf += 2;
612    u_buf += 1;
613    v_buf += 1;
614    rgb_buf += 8;  // Advance 2 pixels.
615  }
616  if (width & 1) {
617    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
618  }
619}
620
621void I422ToABGRRow_C(const uint8* y_buf,
622                     const uint8* u_buf,
623                     const uint8* v_buf,
624                     uint8* rgb_buf,
625                     int width) {
626  for (int x = 0; x < width - 1; x += 2) {
627    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
628    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
629    y_buf += 2;
630    u_buf += 1;
631    v_buf += 1;
632    rgb_buf += 8;  // Advance 2 pixels.
633  }
634  if (width & 1) {
635    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
636  }
637}
638
639void I422ToRGBARow_C(const uint8* y_buf,
640                     const uint8* u_buf,
641                     const uint8* v_buf,
642                     uint8* rgb_buf,
643                     int width) {
644  for (int x = 0; x < width - 1; x += 2) {
645    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
646    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 24, 16, 8);
647    y_buf += 2;
648    u_buf += 1;
649    v_buf += 1;
650    rgb_buf += 8;  // Advance 2 pixels.
651  }
652  if (width & 1) {
653    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
654  }
655}
656
657void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
658  for (int x = 0; x < width; ++x) {
659    YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
660    y_buf += 1;
661    rgb_buf += 4;  // Advance 1 pixel.
662  }
663}
664
665void MirrorRow_C(const uint8* src, uint8* dst, int width) {
666  src += width - 1;
667  for (int x = 0; x < width - 1; x += 2) {
668    dst[x] = src[0];
669    dst[x + 1] = src[-1];
670    src -= 2;
671  }
672  if (width & 1) {
673    dst[width - 1] = src[0];
674  }
675}
676
677void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
678  src_uv += (width - 1) << 1;
679  for (int x = 0; x < width - 1; x += 2) {
680    dst_u[x] = src_uv[0];
681    dst_u[x + 1] = src_uv[-2];
682    dst_v[x] = src_uv[1];
683    dst_v[x + 1] = src_uv[-2 + 1];
684    src_uv -= 4;
685  }
686  if (width & 1) {
687    dst_u[width - 1] = src_uv[0];
688    dst_v[width - 1] = src_uv[1];
689  }
690}
691
692void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
693  const uint32* src32 = reinterpret_cast<const uint32*>(src);
694  uint32* dst32 = reinterpret_cast<uint32*>(dst);
695  src32 += width - 1;
696  for (int x = 0; x < width - 1; x += 2) {
697    dst32[x] = src32[0];
698    dst32[x + 1] = src32[-1];
699    src32 -= 2;
700  }
701  if (width & 1) {
702    dst32[width - 1] = src32[0];
703  }
704}
705
706void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
707  for (int x = 0; x < width - 1; x += 2) {
708    dst_u[x] = src_uv[0];
709    dst_u[x + 1] = src_uv[2];
710    dst_v[x] = src_uv[1];
711    dst_v[x + 1] = src_uv[3];
712    src_uv += 4;
713  }
714  if (width & 1) {
715    dst_u[width - 1] = src_uv[0];
716    dst_v[width - 1] = src_uv[1];
717  }
718}
719
720void CopyRow_C(const uint8* src, uint8* dst, int count) {
721  memcpy(dst, src, count);
722}
723
724void SetRow8_C(uint8* dst, uint32 v8, int count) {
725#ifdef _MSC_VER
726  // VC will generate rep stosb.
727  for (int x = 0; x < count; ++x) {
728    dst[x] = v8;
729  }
730#else
731  memset(dst, v8, count);
732#endif
733}
734
735void SetRows32_C(uint8* dst, uint32 v32, int width,
736                 int dst_stride, int height) {
737  for (int y = 0; y < height; ++y) {
738    uint32* d = reinterpret_cast<uint32*>(dst);
739    for (int x = 0; x < width; ++x) {
740      d[x] = v32;
741    }
742    dst += dst_stride;
743  }
744}
745
746// Filter 2 rows of YUY2 UV's (422) into U and V (420).
747void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
748                   uint8* dst_u, uint8* dst_v, int width) {
749  // Output a row of UV values, filtering 2 rows of YUY2.
750  for (int x = 0; x < width; x += 2) {
751    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
752    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
753    src_yuy2 += 4;
754    dst_u += 1;
755    dst_v += 1;
756  }
757}
758
759// Copy row of YUY2 UV's (422) into U and V (422).
760void YUY2ToUV422Row_C(const uint8* src_yuy2,
761                      uint8* dst_u, uint8* dst_v, int width) {
762  // Output a row of UV values.
763  for (int x = 0; x < width; x += 2) {
764    dst_u[0] = src_yuy2[1];
765    dst_v[0] = src_yuy2[3];
766    src_yuy2 += 4;
767    dst_u += 1;
768    dst_v += 1;
769  }
770}
771
772// Copy row of YUY2 Y's (422) into Y (420/422).
773void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
774  // Output a row of Y values.
775  for (int x = 0; x < width - 1; x += 2) {
776    dst_y[x] = src_yuy2[0];
777    dst_y[x + 1] = src_yuy2[2];
778    src_yuy2 += 4;
779  }
780  if (width & 1) {
781    dst_y[width - 1] = src_yuy2[0];
782  }
783}
784
785// Filter 2 rows of UYVY UV's (422) into U and V (420).
786void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
787                   uint8* dst_u, uint8* dst_v, int width) {
788  // Output a row of UV values.
789  for (int x = 0; x < width; x += 2) {
790    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
791    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
792    src_uyvy += 4;
793    dst_u += 1;
794    dst_v += 1;
795  }
796}
797
798// Copy row of UYVY UV's (422) into U and V (422).
799void UYVYToUV422Row_C(const uint8* src_uyvy,
800                      uint8* dst_u, uint8* dst_v, int width) {
801  // Output a row of UV values.
802  for (int x = 0; x < width; x += 2) {
803    dst_u[0] = src_uyvy[0];
804    dst_v[0] = src_uyvy[2];
805    src_uyvy += 4;
806    dst_u += 1;
807    dst_v += 1;
808  }
809}
810
811// Copy row of UYVY Y's (422) into Y (420/422).
812void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
813  // Output a row of Y values.
814  for (int x = 0; x < width - 1; x += 2) {
815    dst_y[x] = src_uyvy[1];
816    dst_y[x + 1] = src_uyvy[3];
817    src_uyvy += 4;
818  }
819  if (width & 1) {
820    dst_y[width - 1] = src_uyvy[1];
821  }
822}
823
824#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
825
826// Blend src_argb0 over src_argb1 and store to dst_argb.
827// dst_argb may be src_argb0 or src_argb1.
828// This code mimics the SSSE3 version for better testability.
829void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
830                    uint8* dst_argb, int width) {
831  for (int x = 0; x < width - 1; x += 2) {
832    uint32 fb = src_argb0[0];
833    uint32 fg = src_argb0[1];
834    uint32 fr = src_argb0[2];
835    uint32 a = src_argb0[3];
836    uint32 bb = src_argb1[0];
837    uint32 bg = src_argb1[1];
838    uint32 br = src_argb1[2];
839    dst_argb[0] = BLEND(fb, bb, a);
840    dst_argb[1] = BLEND(fg, bg, a);
841    dst_argb[2] = BLEND(fr, br, a);
842    dst_argb[3] = 255u;
843
844    fb = src_argb0[4 + 0];
845    fg = src_argb0[4 + 1];
846    fr = src_argb0[4 + 2];
847    a = src_argb0[4 + 3];
848    bb = src_argb1[4 + 0];
849    bg = src_argb1[4 + 1];
850    br = src_argb1[4 + 2];
851    dst_argb[4 + 0] = BLEND(fb, bb, a);
852    dst_argb[4 + 1] = BLEND(fg, bg, a);
853    dst_argb[4 + 2] = BLEND(fr, br, a);
854    dst_argb[4 + 3] = 255u;
855    src_argb0 += 8;
856    src_argb1 += 8;
857    dst_argb += 8;
858  }
859
860  if (width & 1) {
861    uint32 fb = src_argb0[0];
862    uint32 fg = src_argb0[1];
863    uint32 fr = src_argb0[2];
864    uint32 a = src_argb0[3];
865    uint32 bb = src_argb1[0];
866    uint32 bg = src_argb1[1];
867    uint32 br = src_argb1[2];
868    dst_argb[0] = BLEND(fb, bb, a);
869    dst_argb[1] = BLEND(fg, bg, a);
870    dst_argb[2] = BLEND(fr, br, a);
871    dst_argb[3] = 255u;
872  }
873}
874#undef BLEND
875#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
876
877// Multiply source RGB by alpha and store to destination.
878// This code mimics the SSSE3 version for better testability.
879void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
880  for (int i = 0; i < width - 1; i += 2) {
881    uint32 b = src_argb[0];
882    uint32 g = src_argb[1];
883    uint32 r = src_argb[2];
884    uint32 a = src_argb[3];
885    dst_argb[0] = ATTENUATE(b, a);
886    dst_argb[1] = ATTENUATE(g, a);
887    dst_argb[2] = ATTENUATE(r, a);
888    dst_argb[3] = a;
889    b = src_argb[4];
890    g = src_argb[5];
891    r = src_argb[6];
892    a = src_argb[7];
893    dst_argb[4] = ATTENUATE(b, a);
894    dst_argb[5] = ATTENUATE(g, a);
895    dst_argb[6] = ATTENUATE(r, a);
896    dst_argb[7] = a;
897    src_argb += 8;
898    dst_argb += 8;
899  }
900
901  if (width & 1) {
902    const uint32 b = src_argb[0];
903    const uint32 g = src_argb[1];
904    const uint32 r = src_argb[2];
905    const uint32 a = src_argb[3];
906    dst_argb[0] = ATTENUATE(b, a);
907    dst_argb[1] = ATTENUATE(g, a);
908    dst_argb[2] = ATTENUATE(r, a);
909    dst_argb[3] = a;
910  }
911}
912#undef ATTENUATE
913
914// Divide source RGB by alpha and store to destination.
915// b = (b * 255 + (a / 2)) / a;
916// g = (g * 255 + (a / 2)) / a;
917// r = (r * 255 + (a / 2)) / a;
918// Reciprocal method is off by 1 on some values. ie 125
919// 8.16 fixed point inverse table
920#define T(a) 0x10000 / a
921uint32 fixed_invtbl8[256] = {
922  0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
923  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
924  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
925  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
926  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
927  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
928  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
929  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
930  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
931  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
932  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
933  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
934  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
935  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
936  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
937  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
938  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
939  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
940  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
941  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
942  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
943  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
944  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
945  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
946  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
947  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
948  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
949  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
950  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
951  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
952  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
953  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 };
954#undef T
955
956void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
957  for (int i = 0; i < width; ++i) {
958    uint32 b = src_argb[0];
959    uint32 g = src_argb[1];
960    uint32 r = src_argb[2];
961    const uint32 a = src_argb[3];
962    if (a) {
963      const uint32 ia = fixed_invtbl8[a];  // 8.16 fixed point
964      b = (b * ia) >> 8;
965      g = (g * ia) >> 8;
966      r = (r * ia) >> 8;
967      // Clamping should not be necessary but is free in assembly.
968      if (b > 255) {
969        b = 255;
970      }
971      if (g > 255) {
972        g = 255;
973      }
974      if (r > 255) {
975        r = 255;
976      }
977    }
978    dst_argb[0] = b;
979    dst_argb[1] = g;
980    dst_argb[2] = r;
981    dst_argb[3] = a;
982    src_argb += 4;
983    dst_argb += 4;
984  }
985}
986
// Wrappers to handle widths that are not a multiple of 8.
// The SIMD row function converts the leading (width & ~7) pixels and the C
// row function converts the remaining (width & 7) pixels.
// UV_SHIFT is the right shift applied to the pixel offset to advance the U
// and V pointers.  BPP is the number of bytes per destination pixel and is
// used to advance the destination pointer; it must be 3 for the 24 bit
// formats (RGB24, RAW) and 4 for the 32 bit formats.
#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT, BPP)               \
    void NAMEANY(const uint8* y_buf,                                           \
                 const uint8* u_buf,                                           \
                 const uint8* v_buf,                                           \
                 uint8* rgb_buf,                                               \
                 int width) {                                                  \
      int n = width & ~7;                                                      \
      I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n);                          \
      I420TORGB_C(y_buf + n,                                                   \
                  u_buf + (n >> UV_SHIFT),                                     \
                  v_buf + (n >> UV_SHIFT),                                     \
                  rgb_buf + n * BPP, width & 7);                               \
    }

// Same as YANY, for sources with a single interleaved UV plane.
#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT, BPP)               \
    void NAMEANY(const uint8* y_buf,                                           \
                 const uint8* uv_buf,                                          \
                 uint8* rgb_buf,                                               \
                 int width) {                                                  \
      int n = width & ~7;                                                      \
      NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n);                                \
      NV12TORGB_C(y_buf + n,                                                   \
                  uv_buf + (n >> UV_SHIFT),                                    \
                  rgb_buf + n * BPP, width & 7);                               \
    }


#ifdef HAS_I422TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
     0, 4)
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
     1, 4)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
     2, 4)
Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
     0, 4)
Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
     0, 4)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
     1, 4)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
     1, 4)
#endif
#ifdef HAS_I422TORGB24ROW_SSSE3
// RGB24 and RAW are 3 bytes per pixel.
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3,
     I422ToRGB24Row_C, 1, 3)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C,
     1, 3)
#endif
#ifdef HAS_I422TORGBAROW_SSSE3
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
     1, 4)
#endif
#ifdef HAS_I422TOARGBROW_NEON
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4)
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4)
Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
// RGB24 and RAW are 3 bytes per pixel.
YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3)
YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3)
#endif
#undef YANY
#undef Y2NY
1044
// Any-width wrappers for ARGB to packed RGB row conversions.
// The row function ROW_SIMD writes into an aligned scratch row of kMaxStride
// bytes; only the first width * BYTES_PER_PIXEL bytes of that scratch row are
// then copied into the caller's buffer.
// NOTE(review): presumably the scratch row exists because the SIMD function
// may write past width pixels - confirm against the SIMD implementations.
#define RGBANY(ANY_NAME, ROW_SIMD, BYTES_PER_PIXEL)                            \
    void ANY_NAME(const uint8* argb_buf,                                       \
                  uint8* rgb_buf,                                              \
                  int width) {                                                 \
      SIMD_ALIGNED(uint8 row[kMaxStride]);                                     \
      ROW_SIMD(argb_buf, row, width);                                          \
      memcpy(rgb_buf, row, width * BYTES_PER_PIXEL);                           \
    }

#if defined(HAS_ARGBTORGB24ROW_SSSE3)
// 24 bit destinations.
RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
// 16 bit destinations.
RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3)
RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3)
#endif
#undef RGBANY
1066
// Any-width wrappers for row functions that produce one Y byte per pixel.
// The SIMD function is called twice: once from the start of the row with a
// count of width - 16, and once for exactly the last 16 pixels of the row.
// BYTES_PER_PIXEL is the size of one source pixel.
// NOTE(review): a width below 16 would make the second call start before the
// beginning of the row; callers presumably guarantee width >= 16 - confirm.
#define YANY(ANY_NAME, TOY_SIMD, BYTES_PER_PIXEL)                              \
    void ANY_NAME(const uint8* src_argb, uint8* dst_y, int width) {            \
      const int tail = width - 16;                                             \
      TOY_SIMD(src_argb, dst_y, tail);                                         \
      TOY_SIMD(src_argb + tail * BYTES_PER_PIXEL, dst_y + tail, 16);           \
    }

#ifdef HAS_ARGBTOYROW_SSSE3
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
#endif
#ifdef HAS_RGBATOYROW_SSSE3
YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
#endif
#ifdef HAS_YUY2TOYROW_SSE2
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
#endif
#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
#endif
#undef YANY
1090
// Any-width wrappers for row functions that produce U and V from a source row
// and its stride.
// The SIMD function handles the leading multiple of 16 pixels; the C function
// handles the remaining (width & 15) pixels.  The U and V destinations advance
// by half the number of pixels already handled, and the source advances by
// BYTES_PER_PIXEL per pixel.
#define UVANY(ANY_NAME, TOUV_SIMD, TOUV_C, BYTES_PER_PIXEL)                    \
    void ANY_NAME(const uint8* src_argb, int src_stride_argb,                  \
                  uint8* dst_u, uint8* dst_v, int width) {                     \
      const int simd_width = width & ~15;                                      \
      const int uv_done = simd_width >> 1;                                     \
      TOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, simd_width);          \
      TOUV_C(src_argb + simd_width * BYTES_PER_PIXEL, src_stride_argb,         \
             dst_u + uv_done,                                                  \
             dst_v + uv_done,                                                  \
             width & 15);                                                      \
    }

#ifdef HAS_ARGBTOUVROW_SSSE3
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
#endif
#ifdef HAS_RGBATOYROW_SSSE3
UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
#endif
#ifdef HAS_YUY2TOUVROW_SSE2
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
#endif
#undef UVANY
1119
// Any-width wrappers for row functions that produce U and V from a single
// source row (no stride argument).
// The SIMD function handles the leading multiple of 16 pixels; the C function
// handles the remaining (width & 15) pixels.  The U and V destinations advance
// by half the number of pixels already handled, and the source advances by
// BYTES_PER_PIXEL per pixel.
#define UV422ANY(ANY_NAME, TOUV_SIMD, TOUV_C, BYTES_PER_PIXEL)                 \
    void ANY_NAME(const uint8* src_argb,                                       \
                  uint8* dst_u, uint8* dst_v, int width) {                     \
      const int simd_width = width & ~15;                                      \
      const int uv_done = simd_width >> 1;                                     \
      TOUV_SIMD(src_argb, dst_u, dst_v, simd_width);                           \
      TOUV_C(src_argb + simd_width * BYTES_PER_PIXEL,                          \
             dst_u + uv_done,                                                  \
             dst_v + uv_done,                                                  \
             width & 15);                                                      \
    }

#ifdef HAS_YUY2TOUV422ROW_SSE2
UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
         YUY2ToUV422Row_C, 2)
UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
         UYVYToUV422Row_C, 2)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
         YUY2ToUV422Row_C, 2)
UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
         UYVYToUV422Row_C, 2)
#endif
#undef UV422ANY
1144
1145void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
1146                               const int32* previous_cumsum, int width) {
1147  int32 row_sum[4] = {0, 0, 0, 0};
1148  for (int x = 0; x < width; ++x) {
1149    row_sum[0] += row[x * 4 + 0];
1150    row_sum[1] += row[x * 4 + 1];
1151    row_sum[2] += row[x * 4 + 2];
1152    row_sum[3] += row[x * 4 + 3];
1153    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
1154    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
1155    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
1156    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
1157  }
1158}
1159
1160void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
1161                              int w, int area, uint8* dst, int count) {
1162  float ooa = 1.0f / area;
1163  for (int i = 0; i < count; ++i) {
1164    dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
1165    dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
1166    dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
1167    dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
1168    dst += 4;
1169    tl += 4;
1170    bl += 4;
1171  }
1172}
1173
1174#define REPEAT8(v) (v) | ((v) << 8)
1175#define SHADE(f, v) v * f >> 24
1176
1177void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
1178                    uint32 value) {
1179  const uint32 b_scale = REPEAT8(value & 0xff);
1180  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
1181  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
1182  const uint32 a_scale = REPEAT8(value >> 24);
1183
1184  for (int i = 0; i < width; ++i) {
1185    const uint32 b = REPEAT8(src_argb[0]);
1186    const uint32 g = REPEAT8(src_argb[1]);
1187    const uint32 r = REPEAT8(src_argb[2]);
1188    const uint32 a = REPEAT8(src_argb[3]);
1189    dst_argb[0] = SHADE(b, b_scale);
1190    dst_argb[1] = SHADE(g, g_scale);
1191    dst_argb[2] = SHADE(r, r_scale);
1192    dst_argb[3] = SHADE(a, a_scale);
1193    src_argb += 4;
1194    dst_argb += 4;
1195  }
1196}
1197#undef REPEAT8
1198#undef SHADE
1199
1200// Copy pixels from rotated source to destination row with a slope.
1201LIBYUV_API
1202void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
1203                     uint8* dst_argb, const float* uv_dudv, int width) {
1204  // Render a row of pixels from source into a buffer.
1205  float uv[2];
1206  uv[0] = uv_dudv[0];
1207  uv[1] = uv_dudv[1];
1208  for (int i = 0; i < width; ++i) {
1209    int x = static_cast<int>(uv[0]);
1210    int y = static_cast<int>(uv[1]);
1211    *reinterpret_cast<uint32*>(dst_argb) =
1212        *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
1213                                         x * 4);
1214    dst_argb += 4;
1215    uv[0] += uv_dudv[2];
1216    uv[1] += uv_dudv[3];
1217  }
1218}
1219
1220// C version 2x2 -> 2x1.
1221void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
1222                          ptrdiff_t src_stride,
1223                          int dst_width, int source_y_fraction) {
1224  int y1_fraction = source_y_fraction;
1225  int y0_fraction = 256 - y1_fraction;
1226  const uint8* src_ptr1 = src_ptr + src_stride;
1227  uint8* end = dst_ptr + (dst_width << 2);
1228  do {
1229    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1230    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
1231    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
1232    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
1233    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
1234    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
1235    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
1236    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
1237    src_ptr += 8;
1238    src_ptr1 += 8;
1239    dst_ptr += 8;
1240  } while (dst_ptr < end);
1241}
1242
1243#ifdef __cplusplus
1244}  // extern "C"
1245}  // namespace libyuv
1246#endif
1247