133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Copyright 2014 Google Inc. All Rights Reserved. 233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// 333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Use of this source code is governed by a BSD-style license 433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// that can be found in the COPYING file in the root of the source 533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// tree. An additional intellectual property rights grant can be found 633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// in the file PATENTS. All contributing project authors may 733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// be found in the AUTHORS file in the root of the source tree. 833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// ----------------------------------------------------------------------------- 933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// 1033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// YUV->RGB conversion functions 1133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// 1233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Author: Skal (pascal.massimino@gmail.com) 1333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 1433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./yuv.h" 1533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 1633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(WEBP_USE_SSE2) 1733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 1833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <emmintrin.h> 1933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <string.h> // for memcpy 2033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 2133f74dabbc7920a65ed435d7417987589febdc16Vikas Aroratypedef union { // handy struct for converting SSE2 registers 2233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int32_t i32[4]; 2333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t u8[16]; 
2433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora __m128i m; 2533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} VP8kCstSSE2; 2633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 2733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(WEBP_YUV_USE_SSE2_TABLES) 2833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./yuv_tables_sse2.h" 3033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3133f74dabbc7920a65ed435d7417987589febdc16Vikas Aroravoid VP8YUVInitSSE2(void) {} 3233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else 3433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int done_sse2 = 0; 3633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256]; 3733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3833f74dabbc7920a65ed435d7417987589febdc16Vikas Aroravoid VP8YUVInitSSE2(void) { 3933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (!done_sse2) { 4033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int i; 4133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (i = 0; i < 256; ++i) { 4233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kYtoRGBA[i].i32[0] = 4333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kYtoRGBA[i].i32[1] = 4433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2; 4533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2; 4633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 4733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kUtoRGBA[i].i32[0] = 0; 4833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128); 4933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128); 
5033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kUtoRGBA[i].i32[3] = 0; 5133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 5233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128); 5333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128); 5433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kVtoRGBA[i].i32[2] = 0; 5533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kVtoRGBA[i].i32[3] = 0; 5633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 5733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora done_sse2 = 1; 5833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 5933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if 0 // code used to generate 'yuv_tables_sse2.h' 6033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n"); 6133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (i = 0; i < 256; ++i) { 6233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n", 6333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1], 6433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]); 6533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 6633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf("};\n\n"); 6733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n"); 6833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (i = 0; i < 256; ++i) { 6933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n", 7033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]); 7133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 7233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf("};\n\n"); 7333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf("static VP8kCstSSE2 
VP8kVtoRGBA[256] = {\n"); 7433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (i = 0; i < 256; ++i) { 7533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n", 7633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]); 7733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 7833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora printf("};\n\n"); 7933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif 8033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 8133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 8233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 8333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif // WEBP_YUV_USE_SSE2_TABLES 8433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 8533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//----------------------------------------------------------------------------- 8633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 8733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE __m128i LoadUVPart(int u, int v) { 8833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m); 8933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m); 9033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i uv_part = _mm_add_epi32(u_part, v_part); 9133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return uv_part; 9233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 9333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 9433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) { 9533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m); 9633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba1 = _mm_add_epi32(y_part, uv_part); 
9733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2); 9833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return rgba2; 9933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 10033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 10133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) { 10233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i uv_part = LoadUVPart(u, v); 10333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return GetRGBA32bWithUV(y, uv_part); 10433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 10533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 10633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v, 10733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t* const rgb) { 10833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0 = GetRGBA32b(y, u, v); 10933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0); 11033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1); 11133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Note: we store 8 bytes at a time, not 3 bytes! 
-> memory stomp 11233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)rgb, tmp2); 11333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 11433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 11533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v, 11633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t* const bgr) { 11733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0 = GetRGBA32b(y, u, v); 11833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2)); 11933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1); 12033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2); 12133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp 12233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)bgr, tmp3); 12333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 12433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 12533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//----------------------------------------------------------------------------- 12633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Convert spans of 32 pixels to various RGB formats for the fancy upsampler. 

#ifdef FANCY_UPSAMPLING

// Converts 32 pixels to RGBA (4 bytes per pixel, 128 bytes written to 'dst').
// 'y', 'u' and 'v' are each read at indices 0..31 (one chroma sample per
// pixel).
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 4) {   // four pixels == one 16-byte store
    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
    const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
    const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
    _mm_storeu_si128((__m128i*)dst, tmp2);
    dst += 4 * 4;
  }
}

// Converts 32 pixels to BGRA (4 bytes per pixel, 128 bytes written to 'dst').
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 2) {   // two pixels == one 8-byte store
    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
    // Swap lanes 0 and 2: R,G,B,A -> B,G,R,A.
    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
    _mm_storel_epi64((__m128i*)dst, tmp3);
    dst += 4 * 2;
  }
}

// Converts 32 pixels to packed RGB (3 bytes per pixel, exactly 96 bytes
// written to 'dst').
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  int n;
  // Scratch for the last two pixels. The second 8-byte store starts at
  // offset 3, hence 3 + 8 bytes. _mm_storel_epi64() has no alignment
  // requirement, so a plain array is enough.
  uint8_t tmp[3 + 8];
  for (n = 0; n < 30; ++n) {   // we directly stomp the *dst memory
    // Each call writes 8 bytes; the 5 extra ones are overwritten by the
    // following pixels and never go past byte 94 of 'dst'.
    YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
  }
  // Last two pixels are special: we write in a tmp buffer before sending
  // to dst, so that nothing is written past the 96th byte.
  YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
  YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
  memcpy(dst + n * 3, tmp, 2 * 3);
}

// Converts 32 pixels to packed BGR (3 bytes per pixel, exactly 96 bytes
// written to 'dst').
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  int n;
  // Scratch for the last two pixels (see VP8YuvToRgb32 for the sizing).
  uint8_t tmp[3 + 8];
  for (n = 0; n < 30; ++n) {   // we directly stomp the *dst memory
    YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
  }
  // Last two pixels go through the scratch buffer to avoid overrunning dst.
  YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
  YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
  memcpy(dst + n * 3, tmp, 2 * 3);
}

#endif  // FANCY_UPSAMPLING

//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions

19433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToRgbaRowSSE2(const uint8_t* y, 19533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* u, const uint8_t* v, 19633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t* dst, int len) { 19733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int n; 19833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (n = 0; n + 4 <= len; n += 4) { 19933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i uv_0 = LoadUVPart(u[0], v[0]); 20033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i uv_1 = LoadUVPart(u[1], v[1]); 20133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); 20233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); 20333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1); 20433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1); 20533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); 20633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); 20733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); 20833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storeu_si128((__m128i*)dst, tmp2); 20933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst += 4 * 4; 21033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora y += 4; 21133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora u += 2; 21233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora v += 2; 21333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 21433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Finish off 21533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora while (n < len) { 21633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 
VP8YuvToRgba(y[0], u[0], v[0], dst); 21733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst += 4; 21833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++y; 21933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora u += (n & 1); 22033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora v += (n & 1); 22133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++n; 22233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 22333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 22433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 22533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToBgraRowSSE2(const uint8_t* y, 22633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* u, const uint8_t* v, 22733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t* dst, int len) { 22833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int n; 22933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (n = 0; n + 2 <= len; n += 2) { 23033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i uv_0 = LoadUVPart(u[0], v[0]); 23133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); 23233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); 23333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); 23433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); 23533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); 23633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); 23733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)dst, tmp3); 23833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst += 4 * 2; 23933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora y += 2; 
24033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++u; 24133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++v; 24233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 24333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Finish off 24433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (len & 1) { 24533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8YuvToBgra(y[0], u[0], v[0], dst); 24633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 24733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 24833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 24933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToArgbRowSSE2(const uint8_t* y, 25033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* u, const uint8_t* v, 25133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t* dst, int len) { 25233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int n; 25333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (n = 0; n + 2 <= len; n += 2) { 25433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i uv_0 = LoadUVPart(u[0], v[0]); 25533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); 25633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); 25733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3)); 25833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3)); 25933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); 26033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); 26133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)dst, tmp3); 26233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst += 4 * 2; 
26333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora y += 2; 26433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++u; 26533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++v; 26633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 26733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Finish off 26833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (len & 1) { 26933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8YuvToArgb(y[0], u[0], v[0], dst); 27033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 27133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 27233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 27333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToRgbRowSSE2(const uint8_t* y, 27433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* u, const uint8_t* v, 27533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t* dst, int len) { 27633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int n; 27733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory 27833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora YuvToRgbSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes 27933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst += 3; 28033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++y; 28133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora u += (n & 1); 28233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora v += (n & 1); 28333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 28433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8YuvToRgb(y[0], u[0], v[0], dst); 28533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (len > 1) { 28633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3); 28733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 28833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 28933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 
29033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void YuvToBgrRowSSE2(const uint8_t* y, 29133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* u, const uint8_t* v, 29233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8_t* dst, int len) { 29333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int n; 29433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory 29533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora YuvToBgrSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes 29633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst += 3; 29733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora ++y; 29833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora u += (n & 1); 29933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora v += (n & 1); 30033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 30133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8YuvToBgr(y[0], u[0], v[0], dst + 0); 30233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (len > 1) { 30333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3); 30433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 30533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 30633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 30733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif // WEBP_USE_SSE2 30833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 30933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 31033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Entry point 31133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 31233f74dabbc7920a65ed435d7417987589febdc16Vikas Aroraextern void WebPInitSamplersSSE2(void); 31333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 31433f74dabbc7920a65ed435d7417987589febdc16Vikas Aroravoid WebPInitSamplersSSE2(void) { 
31533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(WEBP_USE_SSE2) 31633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora WebPSamplers[MODE_RGB] = YuvToRgbRowSSE2; 31733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2; 31833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora WebPSamplers[MODE_BGR] = YuvToBgrRowSSE2; 31933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2; 32033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2; 32133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif // WEBP_USE_SSE2 32233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 323