// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
1333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
1433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./yuv.h"
1533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
1633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(WEBP_USE_SSE2)
1733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
1833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <emmintrin.h>
1933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <string.h>   // for memcpy
2033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Overlay giving scalar access to the contents of one SSE2 register.
// All three members alias the same 128 bits. 'i32' must stay the first
// member: brace initialisers (as emitted by the table generator) fill it.
typedef union {
  int32_t i32[4];   // the register seen as four signed 32-bit lanes
  uint8_t u8[16];   // the register seen as sixteen bytes
  __m128i m;        // the register itself, for use with intrinsics
} VP8kCstSSE2;
2633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
2733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(WEBP_YUV_USE_SSE2_TABLES)
2833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./yuv_tables_sse2.h"
3033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// The lookup tables come precomputed from "yuv_tables_sse2.h" in this
// configuration, so there is nothing to initialise at run time.
void VP8YUVInitSSE2(void) {
}
3233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
3333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
3433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
3533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int done_sse2 = 0;
3633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
3733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
3833f74dabbc7920a65ed435d7417987589febdc16Vikas Aroravoid VP8YUVInitSSE2(void) {
3933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (!done_sse2) {
4033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    int i;
4133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    for (i = 0; i < 256; ++i) {
4233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kYtoRGBA[i].i32[0] =
4333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        VP8kYtoRGBA[i].i32[1] =
4433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
4533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
4633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
4733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kUtoRGBA[i].i32[0] = 0;
4833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
4933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kUtoRGBA[i].i32[2] =  kUToB * (i - 128);
5033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kUtoRGBA[i].i32[3] = 0;
5133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
5233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kVtoRGBA[i].i32[0] =  kVToR * (i - 128);
5333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
5433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kVtoRGBA[i].i32[2] = 0;
5533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      VP8kVtoRGBA[i].i32[3] = 0;
5633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
5733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    done_sse2 = 1;
5833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
5933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if 0   // code used to generate 'yuv_tables_sse2.h'
6033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n");
6133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    for (i = 0; i < 256; ++i) {
6233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      printf("  {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n",
6333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora             VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1],
6433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora             VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]);
6533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
6633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    printf("};\n\n");
6733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n");
6833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    for (i = 0; i < 256; ++i) {
6933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      printf("  {{0, 0x%.8x, 0x%.8x, 0}},\n",
7033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora             VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]);
7133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
7233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    printf("};\n\n");
7333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n");
7433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    for (i = 0; i < 256; ++i) {
7533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      printf("  {{0x%.8x, 0x%.8x, 0, 0}},\n",
7633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora             VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]);
7733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
7833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    printf("};\n\n");
7933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif
8033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
8133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
8233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // WEBP_YUV_USE_SSE2_TABLES
8433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//-----------------------------------------------------------------------------
8633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE __m128i LoadUVPart(int u, int v) {
8833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
8933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
9033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
9133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return uv_part;
9233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
9333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
9433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
9533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
9633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
9733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
9833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return rgba2;
9933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
10033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
10133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) {
10233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i uv_part = LoadUVPart(u, v);
10333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return GetRGBA32bWithUV(y, uv_part);
10433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
10533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
10633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
10733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     uint8_t* const rgb) {
10833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i tmp0 = GetRGBA32b(y, u, v);
10933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
11033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
11133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
11233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  _mm_storel_epi64((__m128i*)rgb, tmp2);
11333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
11433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
11533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
11633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     uint8_t* const bgr) {
11733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i tmp0 = GetRGBA32b(y, u, v);
11833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
11933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
12033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
12133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
12233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  _mm_storel_epi64((__m128i*)bgr, tmp3);
12333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
12433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
12733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
12833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#ifdef FANCY_UPSAMPLING
12933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
13033f74dabbc7920a65ed435d7417987589febdc16Vikas Aroravoid VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
13133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                    uint8_t* dst) {
13233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int n;
13333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (n = 0; n < 32; n += 4) {
13433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
13533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
13633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
13733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
13833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
13933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
14033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
14133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    _mm_storeu_si128((__m128i*)dst, tmp2);
14233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    dst += 4 * 4;
14333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
14433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
14533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
14633f74dabbc7920a65ed435d7417987589febdc16Vikas Aroravoid VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
14733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                    uint8_t* dst) {
14833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int n;
14933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (n = 0; n < 32; n += 2) {
15033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
15133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
15233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
15333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
15433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
15533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
15633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    _mm_storel_epi64((__m128i*)dst, tmp3);
15733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    dst += 4 * 2;
15833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
15933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
16033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Converts exactly 32 pixels to packed 3-byte RGB (96 bytes written to 'dst').
// The per-pixel helper stores 8 bytes for every 3-byte pixel. For the first
// 30 pixels the 5 surplus bytes land inside 'dst' and are overwritten by the
// following pixels. The last two pixels are converted into a scratch buffer
// and copied, so nothing is written past dst[95].
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  // 2 pixels * 3 bytes + 5 bytes of store overshoot + 15 bytes of slack for
  // rounding the start address up to a multiple of 16.
  uint8_t scratch_mem[2 * 3 + 5 + 15];
  uint8_t* const scratch =
      (uint8_t*)((uintptr_t)(scratch_mem + 15) & ~15);  // align
  int i;
  for (i = 0; i < 30; ++i) {
    YuvToRgbSSE2(y[i], u[i], v[i], dst + i * 3);
  }
  YuvToRgbSSE2(y[30], u[30], v[30], scratch + 0);
  YuvToRgbSSE2(y[31], u[31], v[31], scratch + 3);
  memcpy(dst + 30 * 3, scratch, 2 * 3);
}
17533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Converts exactly 32 pixels to packed 3-byte BGR (96 bytes written to 'dst').
// The per-pixel helper stores 8 bytes for every 3-byte pixel. For the first
// 30 pixels the 5 surplus bytes land inside 'dst' and are overwritten by the
// following pixels. The last two pixels are converted into a scratch buffer
// and copied, so nothing is written past dst[95].
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  // 2 pixels * 3 bytes + 5 bytes of store overshoot + 15 bytes of slack for
  // rounding the start address up to a multiple of 16.
  uint8_t scratch_mem[2 * 3 + 5 + 15];
  uint8_t* const scratch =
      (uint8_t*)((uintptr_t)(scratch_mem + 15) & ~15);  // align
  int i;
  for (i = 0; i < 30; ++i) {
    YuvToBgrSSE2(y[i], u[i], v[i], dst + i * 3);
  }
  YuvToBgrSSE2(y[30], u[30], v[30], scratch + 0);
  YuvToBgrSSE2(y[31], u[31], v[31], scratch + 3);
  memcpy(dst + 30 * 3, scratch, 2 * 3);
}
18833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
18933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // FANCY_UPSAMPLING
19033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
19333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
19433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToRgbaRowSSE2(const uint8_t* y,
19533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             const uint8_t* u, const uint8_t* v,
19633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             uint8_t* dst, int len) {
19733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int n;
19833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (n = 0; n + 4 <= len; n += 4) {
19933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i uv_0 = LoadUVPart(u[0], v[0]);
20033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i uv_1 = LoadUVPart(u[1], v[1]);
20133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
20233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
20333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1);
20433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1);
20533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
20633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
20733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
20833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    _mm_storeu_si128((__m128i*)dst, tmp2);
20933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    dst += 4 * 4;
21033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    y += 4;
21133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    u += 2;
21233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    v += 2;
21333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
21433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Finish off
21533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  while (n < len) {
21633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    VP8YuvToRgba(y[0], u[0], v[0], dst);
21733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    dst += 4;
21833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ++y;
21933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    u += (n & 1);
22033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    v += (n & 1);
22133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ++n;
22233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
22333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
22433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
22533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToBgraRowSSE2(const uint8_t* y,
22633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             const uint8_t* u, const uint8_t* v,
22733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             uint8_t* dst, int len) {
22833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int n;
22933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (n = 0; n + 2 <= len; n += 2) {
23033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i uv_0 = LoadUVPart(u[0], v[0]);
23133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
23233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
23333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
23433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
23533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
23633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
23733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    _mm_storel_epi64((__m128i*)dst, tmp3);
23833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    dst += 4 * 2;
23933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    y += 2;
24033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ++u;
24133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ++v;
24233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
24333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Finish off
24433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (len & 1) {
24533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    VP8YuvToBgra(y[0], u[0], v[0], dst);
24633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
24733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
24833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
24933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToArgbRowSSE2(const uint8_t* y,
25033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             const uint8_t* u, const uint8_t* v,
25133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             uint8_t* dst, int len) {
25233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int n;
25333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (n = 0; n + 2 <= len; n += 2) {
25433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i uv_0 = LoadUVPart(u[0], v[0]);
25533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
25633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
25733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3));
25833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3));
25933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
26033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
26133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    _mm_storel_epi64((__m128i*)dst, tmp3);
26233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    dst += 4 * 2;
26333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    y += 2;
26433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ++u;
26533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ++v;
26633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
26733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Finish off
26833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (len & 1) {
26933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    VP8YuvToArgb(y[0], u[0], v[0], dst);
27033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
27133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
27233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Converts a row of 'len' pixels to packed 3-byte RGB.
//   y       : 'len' luma samples.
//   u, v    : chroma samples, one per horizontal pair of pixels.
//   dst     : receives 3 * len bytes.
//   len     : number of pixels; a row with len <= 0 is left untouched.
// YuvToRgbSSE2() stores 8 bytes per pixel, so it is only used while the
// 5 surplus bytes still fall inside the row. The last two pixels go through
// the scalar VP8YuvToRgb(), which writes into dst + 0 / dst + 3 only.
static void YuvToRgbRowSSE2(const uint8_t* y,
                            const uint8_t* u, const uint8_t* v,
                            uint8_t* dst, int len) {
  int n;
  // Without this guard the unconditional scalar call below would read y[0]
  // and write one pixel into 'dst' even for an empty row.
  if (len <= 0) return;
  for (n = 0; n + 2 < len; ++n) {   // we directly stomp the *dst memory
    YuvToRgbSSE2(y[0], u[0], v[0], dst);  // stomps 8 bytes
    dst += 3;
    ++y;
    u += (n & 1);   // move to the next chroma sample after each odd pixel
    v += (n & 1);
  }
  VP8YuvToRgb(y[0], u[0], v[0], dst);
  if (len > 1) {
    // The very last pixel moves on to the next chroma sample only when the
    // pixel before it had an odd index.
    VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3);
  }
}
28933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Converts a row of 'len' pixels to packed 3-byte BGR.
//   y       : 'len' luma samples.
//   u, v    : chroma samples, one per horizontal pair of pixels.
//   dst     : receives 3 * len bytes.
//   len     : number of pixels; a row with len <= 0 is left untouched.
// YuvToBgrSSE2() stores 8 bytes per pixel, so it is only used while the
// 5 surplus bytes still fall inside the row. The last two pixels go through
// the scalar VP8YuvToBgr(), which writes into dst + 0 / dst + 3 only.
static void YuvToBgrRowSSE2(const uint8_t* y,
                            const uint8_t* u, const uint8_t* v,
                            uint8_t* dst, int len) {
  int n;
  // Without this guard the unconditional scalar call below would read y[0]
  // and write one pixel into 'dst' even for an empty row.
  if (len <= 0) return;
  for (n = 0; n + 2 < len; ++n) {   // we directly stomp the *dst memory
    YuvToBgrSSE2(y[0], u[0], v[0], dst);  // stomps 8 bytes
    dst += 3;
    ++y;
    u += (n & 1);   // move to the next chroma sample after each odd pixel
    v += (n & 1);
  }
  VP8YuvToBgr(y[0], u[0], v[0], dst + 0);
  if (len > 1) {
    // The very last pixel moves on to the next chroma sample only when the
    // pixel before it had an odd index.
    VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3);
  }
}
30633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
30733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // WEBP_USE_SSE2
30833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
//------------------------------------------------------------------------------
// Entry point
31133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
extern void WebPInitSamplersSSE2(void);

// Points the WebPSamplers[] entries for the RGB, RGBA, BGR, BGRA and ARGB
// modes at the SSE2 row converters defined in this file. Compiles to an
// empty function when WEBP_USE_SSE2 is not defined.
void WebPInitSamplersSSE2(void) {
#if defined(WEBP_USE_SSE2)
  // 4-bytes-per-pixel outputs.
  WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2;
  WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2;
  WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2;
  // 3-bytes-per-pixel outputs.
  WebPSamplers[MODE_RGB]  = YuvToRgbRowSSE2;
  WebPSamplers[MODE_BGR]  = YuvToBgrRowSSE2;
#endif  // WEBP_USE_SSE2
}
323