1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "build/build_config.h"
6#include "media/base/simd/convert_rgb_to_yuv.h"
7#include "media/base/simd/yuv_to_rgb_table.h"
8
9#if defined(COMPILER_MSVC)
10#include <intrin.h>
11#else
12#include <mmintrin.h>
13#include <emmintrin.h>
14#endif
15
16namespace media {
17
18#define FIX_SHIFT 12
19#define FIX(x) ((x) * (1 << FIX_SHIFT))
20
21// Define a convenient macro to do static cast.
22#define INT16_FIX(x) static_cast<int16>(FIX(x))
23
24// Android's pixel layout is RGBA, while other platforms
25// are BGRA.
26#if defined(OS_ANDROID)
27SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
28  INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
29  INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
30  -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
31  -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
32  INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
33  INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
34};
35#else
36SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
37  INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
38  INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
39  INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
40  INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
41  -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
42  -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
43};
44#endif
45
46#undef INT16_FIX
47
// Bias added to the Y values after the fixed-point down-shift: 16 in each of
// the four 32-bit lanes. The SSE2 row converter also derives the U/V bias of
// 128 from this constant by shifting every lane left by 3.
SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
52
// Saturates |value| to the 8-bit range [0, 255].
static inline int Clamp(int value) {
  const int kLowest = 0;
  const int kHighest = 255;
  return value < kLowest ? kLowest : (value > kHighest ? kHighest : value);
}
60
61static inline int RGBToY(int r, int g, int b) {
62  int y = ConvertRGBAToYUV_kTable[0] * b +
63      ConvertRGBAToYUV_kTable[1] * g +
64      ConvertRGBAToYUV_kTable[2] * r;
65  y >>= FIX_SHIFT;
66  return Clamp(y + 16);
67}
68
69static inline int RGBToU(int r, int g, int b, int shift) {
70  int u = ConvertRGBAToYUV_kTable[8] * b +
71      ConvertRGBAToYUV_kTable[9] * g +
72      ConvertRGBAToYUV_kTable[10] * r;
73  u >>= FIX_SHIFT + shift;
74  return Clamp(u + 128);
75}
76
77static inline int RGBToV(int r, int g, int b, int shift) {
78  int v = ConvertRGBAToYUV_kTable[16] * b +
79      ConvertRGBAToYUV_kTable[17] * g +
80      ConvertRGBAToYUV_kTable[18] * r;
81  v >>= FIX_SHIFT + shift;
82  return Clamp(v + 128);
83}
84
// Converts the next 4-byte pixel at |rgb_buf| into one luma byte at |y_buf|.
//
// Reads the first three bytes of the pixel, in memory order, into the
// caller's locals |b|, |g| and |r|, skips the fourth byte, adds the three
// values to the caller's |sum_b|, |sum_g| and |sum_r| accumulators and
// advances both pointers. The caller must declare all six locals.
//
// The body is wrapped in do/while(0) and the arguments are parenthesized so
// that an invocation followed by a semicolon is exactly one statement and is
// safe inside an unbraced if/else.
#define CONVERT_Y(rgb_buf, y_buf)   \
  do {                              \
    b = *(rgb_buf)++;               \
    g = *(rgb_buf)++;               \
    r = *(rgb_buf)++;               \
    ++(rgb_buf);                    \
    sum_b += b;                     \
    sum_g += g;                     \
    sum_r += r;                     \
    *(y_buf)++ = RGBToY(r, g, b);   \
  } while (0)
94
95static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
96                                        const uint8* rgb_buf_2,
97                                        uint8* y_buf_1,
98                                        uint8* y_buf_2,
99                                        uint8* u_buf,
100                                        uint8* v_buf) {
101  int sum_b = 0;
102  int sum_g = 0;
103  int sum_r = 0;
104  int r, g, b;
105
106
107
108  CONVERT_Y(rgb_buf_1, y_buf_1);
109  CONVERT_Y(rgb_buf_1, y_buf_1);
110  CONVERT_Y(rgb_buf_2, y_buf_2);
111  CONVERT_Y(rgb_buf_2, y_buf_2);
112  *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
113  *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
114}
115
116static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
117                                        const uint8* rgb_buf_2,
118                                        uint8* y_buf_1,
119                                        uint8* y_buf_2,
120                                        uint8* u_buf,
121                                        uint8* v_buf) {
122  int sum_b = 0;
123  int sum_g = 0;
124  int sum_r = 0;
125  int r, g, b;
126
127  CONVERT_Y(rgb_buf_1, y_buf_1);
128  CONVERT_Y(rgb_buf_2, y_buf_2);
129  *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
130  *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
131}
132
133static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
134                                       uint8* y_buf,
135                                       uint8* u_buf,
136                                       uint8* v_buf) {
137  int sum_b = 0;
138  int sum_g = 0;
139  int sum_r = 0;
140  int r, g, b;
141
142  CONVERT_Y(rgb_buf, y_buf);
143  CONVERT_Y(rgb_buf, y_buf);
144  *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
145  *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
146}
147
148static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
149                                       uint8* y_buf,
150                                       uint8* u_buf,
151                                       uint8* v_buf) {
152  int sum_b = 0;
153  int sum_g = 0;
154  int sum_r = 0;
155  int r, g, b;
156
157  CONVERT_Y(rgb_buf, y_buf);
158  *u_buf++ = RGBToU(r, g, b, 0);
159  *v_buf++ = RGBToV(r, g, b, 0);
160}
161
162static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
163                                      const uint8* rgb_buf_2,
164                                      uint8* y_buf_1,
165                                      uint8* y_buf_2,
166                                      uint8* u_buf,
167                                      uint8* v_buf,
168                                      int width) {
169  while (width >= 4) {
170    // Name for the Y pixels:
171    // Row 1: a b c d
172    // Row 2: e f g h
173    //
174    // First row 4 pixels.
175    __m128i rgb_row_1 = _mm_loadu_si128(
176        reinterpret_cast<const __m128i*>(rgb_buf_1));
177    __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);
178
179    __m128i y_table = _mm_load_si128(
180        reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
181
182    __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
183    rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);
184
185    __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
186    rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
187
188    // Do a crazh shuffle so that we get:
189    //  v------------ Multiply Add
190    // BG: a b c d
191    // A0: a b c d
192    __m128i bg_abcd = _mm_castps_si128(
193        _mm_shuffle_ps(
194            _mm_castsi128_ps(rgb_c_d),
195            _mm_castsi128_ps(rgb_a_b),
196            (3 << 6) | (1 << 4) | (3 << 2) | 1));
197    __m128i r_abcd = _mm_castps_si128(
198        _mm_shuffle_ps(
199            _mm_castsi128_ps(rgb_c_d),
200            _mm_castsi128_ps(rgb_a_b),
201            (2 << 6) | (2 << 2)));
202    __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
203
204    // Down shift back to 8bits range.
205    __m128i y_offset = _mm_load_si128(
206        reinterpret_cast<const __m128i*>(kYOffset));
207    y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
208    y_abcd = _mm_add_epi32(y_abcd, y_offset);
209    y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
210    y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
211    *reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
212    y_buf_1 += 4;
213
214    // Second row 4 pixels.
215    __m128i rgb_row_2 = _mm_loadu_si128(
216        reinterpret_cast<const __m128i*>(rgb_buf_2));
217    __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
218    __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
219    __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
220
221    // Add two rows together.
222    __m128i rgb_ae_bf =
223        _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
224    __m128i rgb_cg_dh =
225        _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
226
227    // Multiply add like the previous row.
228    rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
229    rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
230
231    __m128i bg_efgh = _mm_castps_si128(
232        _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
233                       _mm_castsi128_ps(rgb_e_f),
234                       (3 << 6) | (1 << 4) | (3 << 2) | 1));
235    __m128i r_efgh = _mm_castps_si128(
236        _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
237                       _mm_castsi128_ps(rgb_e_f),
238                       (2 << 6) | (2 << 2)));
239    __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
240    y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
241    y_efgh = _mm_add_epi32(y_efgh, y_offset);
242    y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
243    y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
244    *reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
245    y_buf_2 += 4;
246
247    __m128i rgb_ae_cg = _mm_castps_si128(
248        _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
249                       _mm_castsi128_ps(rgb_ae_bf),
250                       (3 << 6) | (2 << 4) | (3 << 2) | 2));
251    __m128i rgb_bf_dh = _mm_castps_si128(
252        _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
253                       _mm_castsi128_ps(rgb_ae_bf),
254                       (1 << 6) | (1 << 2)));
255
256    // This is a 2x2 subsampling for 2 pixels.
257    __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
258
259    // Do a multiply add with U table.
260    __m128i u_a_b = _mm_madd_epi16(
261        rgb_abef_cdgh,
262        _mm_load_si128(
263            reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
264    u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
265                          _mm_shuffle_epi32(u_a_b, (2 << 2)));
266    // Right shift 14 because of 12 from fixed point and 2 from subsampling.
267    u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
268    __m128i uv_offset = _mm_slli_epi32(y_offset, 3);
269    u_a_b = _mm_add_epi32(u_a_b, uv_offset);
270    u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
271    u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
272    *reinterpret_cast<uint16*>(u_buf) = _mm_extract_epi16(u_a_b, 0);
273    u_buf += 2;
274
275    __m128i v_a_b = _mm_madd_epi16(
276        rgb_abef_cdgh,
277        _mm_load_si128(
278            reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
279    v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
280                          _mm_shuffle_epi32(v_a_b, (2 << 2)));
281    v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
282    v_a_b = _mm_add_epi32(v_a_b, uv_offset);
283    v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
284    v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
285    *reinterpret_cast<uint16*>(v_buf) = _mm_extract_epi16(v_a_b, 0);
286    v_buf += 2;
287
288    rgb_buf_1 += 16;
289    rgb_buf_2 += 16;
290
291    // Move forward by 4 pixels.
292    width -= 4;
293  }
294
295  // Just use C code to convert the remaining pixels.
296  if (width >= 2) {
297    ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
298    rgb_buf_1 += 8;
299    rgb_buf_2 += 8;
300    y_buf_1 += 2;
301    y_buf_2 += 2;
302    ++u_buf;
303    ++v_buf;
304    width -= 2;
305  }
306
307  if (width)
308    ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
309}
310
311extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
312                                   uint8* yplane,
313                                   uint8* uplane,
314                                   uint8* vplane,
315                                   int width,
316                                   int height,
317                                   int rgbstride,
318                                   int ystride,
319                                   int uvstride) {
320  while (height >= 2) {
321    ConvertRGB32ToYUVRow_SSE2(rgbframe,
322                              rgbframe + rgbstride,
323                              yplane,
324                              yplane + ystride,
325                              uplane,
326                              vplane,
327                              width);
328    rgbframe += 2 * rgbstride;
329    yplane += 2 * ystride;
330    uplane += uvstride;
331    vplane += uvstride;
332    height -= 2;
333  }
334
335  if (!height)
336    return;
337
338  // Handle the last row.
339  while (width >= 2) {
340    ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
341    rgbframe += 8;
342    yplane += 2;
343    ++uplane;
344    ++vplane;
345    width -= 2;
346  }
347
348  if (width)
349    ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
350}
351
352void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe,
353                                      uint8* yplane,
354                                      uint8* uplane,
355                                      uint8* vplane,
356                                      int width,
357                                      int height,
358                                      int rgbstride,
359                                      int ystride,
360                                      int uvstride) {
361  while (height >= 2) {
362    int i = 0;
363
364    // Convert a 2x2 block.
365    while (i + 2 <= width) {
366      ConvertRGBToYUV_V2H2(rgbframe + i * 4,
367                           rgbframe + rgbstride + i * 4,
368                           yplane + i,
369                           yplane + ystride + i,
370                           uplane + i / 2,
371                           vplane + i / 2);
372      i += 2;
373    }
374
375    // Convert the last pixel of two rows.
376    if (i < width) {
377      ConvertRGBToYUV_V2H1(rgbframe + i * 4,
378                           rgbframe + rgbstride + i * 4,
379                           yplane + i,
380                           yplane + ystride + i,
381                           uplane + i / 2,
382                           vplane + i / 2);
383    }
384
385    rgbframe += 2 * rgbstride;
386    yplane += 2 * ystride;
387    uplane += uvstride;
388    vplane += uvstride;
389    height -= 2;
390  }
391
392  if (!height)
393    return;
394
395  // Handle the last row.
396  while (width >= 2) {
397    ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
398    rgbframe += 8;
399    yplane += 2;
400    ++uplane;
401    ++vplane;
402    width -= 2;
403  }
404
405  // Handle the last pixel in the last row.
406  if (width)
407    ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
408}
409
410}  // namespace media
411