// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
13af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
14af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "./yuv.h"
15af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
16af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(WEBP_USE_SSE2)
17af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
18af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include <emmintrin.h>
19af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include <string.h>   // for memcpy
20af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Overlay giving scalar access to the contents of one SSE2 register.
// 'i32' has to remain the first member: brace initializers of the form
// {{a, b, c, d}} (as printed by the table generator) fill the first member.
typedef union VP8kCstSSE2 {
  int32_t i32[4];   // four 32-bit lanes
  uint8_t u8[16];   // the same 16 bytes, seen individually
  __m128i m;        // the whole register, as used by the SSE2 loads
} VP8kCstSSE2;
26af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
27af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(WEBP_YUV_USE_SSE2_TABLES)
28af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
29af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "./yuv_tables_sse2.h"
30af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Variant used with WEBP_YUV_USE_SSE2_TABLES: the conversion tables are
// compiled in from yuv_tables_sse2.h, so there is nothing to set up.
void VP8YUVInitSSE2(void) {
}
32af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
33af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else
34af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
35af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int done_sse2 = 0;
36af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
37af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
38af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YUVInitSSE2(void) {
39af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  if (!done_sse2) {
40af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    int i;
41af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    for (i = 0; i < 256; ++i) {
42af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kYtoRGBA[i].i32[0] =
43af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora        VP8kYtoRGBA[i].i32[1] =
44af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora        VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
45af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
46af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
47af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kUtoRGBA[i].i32[0] = 0;
48af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
49af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kUtoRGBA[i].i32[2] =  kUToB * (i - 128);
50af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kUtoRGBA[i].i32[3] = 0;
51af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
52af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kVtoRGBA[i].i32[0] =  kVToR * (i - 128);
53af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
54af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kVtoRGBA[i].i32[2] = 0;
55af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      VP8kVtoRGBA[i].i32[3] = 0;
56af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    }
57af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    done_sse2 = 1;
58af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
59af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if 0   // code used to generate 'yuv_tables_sse2.h'
60af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n");
61af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    for (i = 0; i < 256; ++i) {
62af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      printf("  {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n",
63af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora             VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1],
64af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora             VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]);
65af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    }
66af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    printf("};\n\n");
67af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n");
68af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    for (i = 0; i < 256; ++i) {
69af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      printf("  {{0, 0x%.8x, 0x%.8x, 0}},\n",
70af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora             VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]);
71af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    }
72af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    printf("};\n\n");
73af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n");
74af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    for (i = 0; i < 256; ++i) {
75af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      printf("  {{0x%.8x, 0x%.8x, 0, 0}},\n",
76af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora             VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]);
77af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    }
78af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    printf("};\n\n");
79af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif
80af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
81af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
82af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
83af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif  // WEBP_YUV_USE_SSE2_TABLES
84af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
85af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//-----------------------------------------------------------------------------
86af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
87af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE __m128i LoadUVPart(int u, int v) {
88af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
89af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
90af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
91af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return uv_part;
92af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
93af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
94af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
95af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
96af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
97af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
98af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return rgba2;
99af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
100af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
101af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) {
102af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i uv_part = LoadUVPart(u, v);
103af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return GetRGBA32bWithUV(y, uv_part);
104af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
105af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
106af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
107af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                     uint8_t* const rgb) {
108af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i tmp0 = GetRGBA32b(y, u, v);
109af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
110af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
111af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
112af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  _mm_storel_epi64((__m128i*)rgb, tmp2);
113af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
114af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
115af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
116af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                     uint8_t* const bgr) {
117af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i tmp0 = GetRGBA32b(y, u, v);
118af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
119af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
120af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
121af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
122af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  _mm_storel_epi64((__m128i*)bgr, tmp3);
123af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
124af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
127af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
128af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#ifdef FANCY_UPSAMPLING
129af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
130af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
131af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                    uint8_t* dst) {
132af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int n;
133af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (n = 0; n < 32; n += 4) {
134af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
135af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
136af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
137af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
138af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
139af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
140af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
141af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    _mm_storeu_si128((__m128i*)dst, tmp2);
142af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    dst += 4 * 4;
143af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
144af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
145af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
146af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
147af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                    uint8_t* dst) {
148af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int n;
149af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (n = 0; n < 32; n += 2) {
150af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
151af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
152af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
153af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
154af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
155af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
156af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    _mm_storel_epi64((__m128i*)dst, tmp3);
157af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    dst += 4 * 2;
158af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
159af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
160af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Converts 32 pixels to packed RGB. 'y', 'u' and 'v' are each read at indices
// 0..31 and exactly 32 * 3 bytes of 'dst' end up written.
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  // Scratch area for the last two pixels: 2 * 3 useful bytes, 5 bytes of
  // over-write from the 8-byte store, 15 bytes of slack for the alignment.
  uint8_t scratch_mem[2 * 3 + 5 + 15];
  uint8_t* const scratch =
      (uint8_t*)(((uintptr_t)scratch_mem + 15) & ~(uintptr_t)15);
  uint8_t* out = dst;
  int i;
  // Each call stores 8 bytes at 'out'; the 5 extra ones fall on the following
  // pixels, which get rewritten afterward.
  for (i = 0; i < 30; ++i, out += 3) {
    YuvToRgbSSE2(y[i], u[i], v[i], out);
  }
  // The final two pixels go through the scratch area so that nothing is
  // written past the 32 * 3 output bytes.
  YuvToRgbSSE2(y[30], u[30], v[30], scratch);
  YuvToRgbSSE2(y[31], u[31], v[31], scratch + 3);
  memcpy(out, scratch, 2 * 3);
}
175af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Converts 32 pixels to packed BGR. 'y', 'u' and 'v' are each read at indices
// 0..31 and exactly 32 * 3 bytes of 'dst' end up written.
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  // Scratch area for the last two pixels: 2 * 3 useful bytes, 5 bytes of
  // over-write from the 8-byte store, 15 bytes of slack for the alignment.
  uint8_t scratch_mem[2 * 3 + 5 + 15];
  uint8_t* const scratch =
      (uint8_t*)(((uintptr_t)scratch_mem + 15) & ~(uintptr_t)15);
  uint8_t* out = dst;
  int i;
  // Each call stores 8 bytes at 'out'; the 5 extra ones fall on the following
  // pixels, which get rewritten afterward.
  for (i = 0; i < 30; ++i, out += 3) {
    YuvToBgrSSE2(y[i], u[i], v[i], out);
  }
  // The final two pixels go through the scratch area so that nothing is
  // written past the 32 * 3 output bytes.
  YuvToBgrSSE2(y[30], u[30], v[30], scratch);
  YuvToBgrSSE2(y[31], u[31], v[31], scratch + 3);
  memcpy(out, scratch, 2 * 3);
}
188af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
189af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif  // FANCY_UPSAMPLING
190af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
193af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
194af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToRgbaRowSSE2(const uint8_t* y,
195af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             const uint8_t* u, const uint8_t* v,
196af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             uint8_t* dst, int len) {
197af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int n;
198af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (n = 0; n + 4 <= len; n += 4) {
199af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i uv_0 = LoadUVPart(u[0], v[0]);
200af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i uv_1 = LoadUVPart(u[1], v[1]);
201af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
202af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
203af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1);
204af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1);
205af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
206af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
207af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
208af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    _mm_storeu_si128((__m128i*)dst, tmp2);
209af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    dst += 4 * 4;
210af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    y += 4;
211af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    u += 2;
212af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    v += 2;
213af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
214af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Finish off
215af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  while (n < len) {
216af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VP8YuvToRgba(y[0], u[0], v[0], dst);
217af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    dst += 4;
218af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ++y;
219af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    u += (n & 1);
220af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    v += (n & 1);
221af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ++n;
222af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
223af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
224af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
225af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToBgraRowSSE2(const uint8_t* y,
226af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             const uint8_t* u, const uint8_t* v,
227af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             uint8_t* dst, int len) {
228af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int n;
229af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (n = 0; n + 2 <= len; n += 2) {
230af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i uv_0 = LoadUVPart(u[0], v[0]);
231af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
232af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
233af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
234af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
235af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
236af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
237af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    _mm_storel_epi64((__m128i*)dst, tmp3);
238af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    dst += 4 * 2;
239af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    y += 2;
240af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ++u;
241af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ++v;
242af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
243af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Finish off
244af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  if (len & 1) {
245af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VP8YuvToBgra(y[0], u[0], v[0], dst);
246af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
247af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
248af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
249af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToArgbRowSSE2(const uint8_t* y,
250af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             const uint8_t* u, const uint8_t* v,
251af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             uint8_t* dst, int len) {
252af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int n;
253af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (n = 0; n + 2 <= len; n += 2) {
254af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i uv_0 = LoadUVPart(u[0], v[0]);
255af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
256af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
257af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3));
258af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3));
259af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
260af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
261af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    _mm_storel_epi64((__m128i*)dst, tmp3);
262af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    dst += 4 * 2;
263af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    y += 2;
264af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ++u;
265af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ++v;
266af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
267af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Finish off
268af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  if (len & 1) {
269af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VP8YuvToArgb(y[0], u[0], v[0], dst);
270af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
271af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
272af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Converts a row of 'len' pixels to packed RGB (3 bytes per pixel in 'dst').
// Every u/v sample is applied to two consecutive y samples.
// Nothing is read or written when 'len' is zero or negative.
static void YuvToRgbRowSSE2(const uint8_t* y,
                            const uint8_t* u, const uint8_t* v,
                            uint8_t* dst, int len) {
  int n;
  // Without this guard the tail code below would still convert one pixel,
  // reading y[0]/u[0]/v[0] and writing 3 bytes to 'dst' for an empty row.
  if (len <= 0) return;
  // YuvToRgbSSE2() stores 8 bytes per pixel. It is only used while at least
  // two more pixels follow, so the 5 extra bytes stay inside the row and are
  // rewritten by the next conversions.
  for (n = 0; n + 2 < len; ++n) {
    YuvToRgbSSE2(y[0], u[0], v[0], dst);
    dst += 3;
    ++y;
    // The chroma pointers only move forward after an odd-indexed pixel.
    u += (n & 1);
    v += (n & 1);
  }
  // The last one or two pixels use the plain C converter, which is called
  // with a 3-byte destination and so does not run past the row.
  VP8YuvToRgb(y[0], u[0], v[0], dst);
  if (len > 1) {
    VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3);
  }
}
289af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Converts a row of 'len' pixels to packed BGR (3 bytes per pixel in 'dst').
// Every u/v sample is applied to two consecutive y samples.
// Nothing is read or written when 'len' is zero or negative.
static void YuvToBgrRowSSE2(const uint8_t* y,
                            const uint8_t* u, const uint8_t* v,
                            uint8_t* dst, int len) {
  int n;
  // Without this guard the tail code below would still convert one pixel,
  // reading y[0]/u[0]/v[0] and writing 3 bytes to 'dst' for an empty row.
  if (len <= 0) return;
  // YuvToBgrSSE2() stores 8 bytes per pixel. It is only used while at least
  // two more pixels follow, so the 5 extra bytes stay inside the row and are
  // rewritten by the next conversions.
  for (n = 0; n + 2 < len; ++n) {
    YuvToBgrSSE2(y[0], u[0], v[0], dst);
    dst += 3;
    ++y;
    // The chroma pointers only move forward after an odd-indexed pixel.
    u += (n & 1);
    v += (n & 1);
  }
  // The last one or two pixels use the plain C converter, which is called
  // with a 3-byte destination and so does not run past the row.
  VP8YuvToBgr(y[0], u[0], v[0], dst + 0);
  if (len > 1) {
    VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3);
  }
}
306af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
307af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif  // WEBP_USE_SSE2
308af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
//------------------------------------------------------------------------------
// Entry point
311af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
extern void WebPInitSamplersSSE2(void);

// Registers the SSE2 row converters of this file in WebPSamplers[] for the
// RGB, RGBA, BGR, BGRA and ARGB modes.
// Does nothing when WEBP_USE_SSE2 is not defined.
void WebPInitSamplersSSE2(void) {
#if defined(WEBP_USE_SSE2)
  // The converters installed below read the VP8k*toRGBA tables, so make sure
  // these are ready before any of them can be called. Repeated calls are
  // harmless: the initialization is either guarded by a flag or empty.
  VP8YUVInitSSE2();
  WebPSamplers[MODE_RGB]  = YuvToRgbRowSSE2;
  WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2;
  WebPSamplers[MODE_BGR]  = YuvToBgrRowSSE2;
  WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2;
  WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2;
#endif  // WEBP_USE_SSE2
}
323