1af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Copyright 2014 Google Inc. All Rights Reserved. 2af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// 3af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Use of this source code is governed by a BSD-style license 4af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// that can be found in the COPYING file in the root of the source 5af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// tree. An additional intellectual property rights grant can be found 6af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// in the file PATENTS. All contributing project authors may 7af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// be found in the AUTHORS file in the root of the source tree. 8af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// ----------------------------------------------------------------------------- 9af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// 10af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// YUV->RGB conversion functions 11af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// 12af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Author: Skal (pascal.massimino@gmail.com) 13af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 14af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "./yuv.h" 15af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 16af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(WEBP_USE_SSE2) 17af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 18af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include <emmintrin.h> 19af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include <string.h> // for memcpy 20af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 21af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroratypedef union { // handy struct for converting SSE2 registers 22af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int32_t i32[4]; 23af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t u8[16]; 24af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora __m128i m; 25af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} VP8kCstSSE2; 26af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 27af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(WEBP_YUV_USE_SSE2_TABLES) 28af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 29af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "./yuv_tables_sse2.h" 30af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 31af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YUVInitSSE2(void) {} 32af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 33af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else 34af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 35af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int done_sse2 = 0; 36af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256]; 37af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 38af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YUVInitSSE2(void) { 39af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (!done_sse2) { 40af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int i; 41af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (i = 0; i < 256; ++i) { 42af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kYtoRGBA[i].i32[0] = 43af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kYtoRGBA[i].i32[1] = 44af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2; 45af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2; 46af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 47af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kUtoRGBA[i].i32[0] = 0; 48af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128); 49af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128); 50af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kUtoRGBA[i].i32[3] = 0; 51af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 52af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128); 53af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128); 54af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kVtoRGBA[i].i32[2] = 0; 55af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kVtoRGBA[i].i32[3] = 0; 56af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 57af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora done_sse2 = 1; 58af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 59af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if 0 // code used to generate 'yuv_tables_sse2.h' 60af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n"); 61af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (i = 0; i < 256; ++i) { 62af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n", 63af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1], 64af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]); 65af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 66af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf("};\n\n"); 67af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n"); 68af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (i = 0; i < 256; ++i) { 69af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n", 70af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]); 71af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 72af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf("};\n\n"); 73af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n"); 74af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (i = 0; i < 256; ++i) { 75af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n", 76af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]); 77af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 78af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora printf("};\n\n"); 79af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif 80af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 81af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 82af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 83af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif // WEBP_YUV_USE_SSE2_TABLES 84af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 85af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//----------------------------------------------------------------------------- 86af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 87af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE __m128i LoadUVPart(int u, int v) { 88af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m); 89af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m); 90af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i uv_part = _mm_add_epi32(u_part, v_part); 91af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return uv_part; 92af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 93af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 94af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) { 95af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m); 96af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i rgba1 = _mm_add_epi32(y_part, uv_part); 97af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2); 98af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return rgba2; 99af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 100af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 101af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) { 102af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i uv_part = LoadUVPart(u, v); 103af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return GetRGBA32bWithUV(y, uv_part); 104af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 105af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 106af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v, 107af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* const rgb) { 108af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0 = GetRGBA32b(y, u, v); 109af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0); 110af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1); 111af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp 112af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora _mm_storel_epi64((__m128i*)rgb, tmp2); 113af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 114af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 115af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v, 116af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* const bgr) { 117af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0 = GetRGBA32b(y, u, v); 118af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2)); 119af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1); 120af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2); 121af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp 122af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora _mm_storel_epi64((__m128i*)bgr, tmp3); 123af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 124af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 125af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//----------------------------------------------------------------------------- 126af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Convert spans of 32 pixels to various RGB formats for the fancy upsampler. 127af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 128af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#ifdef FANCY_UPSAMPLING 129af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 130af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, 131af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst) { 132af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 133af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n < 32; n += 4) { 134af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); 135af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); 136af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]); 137af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]); 138af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); 139af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); 140af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); 141af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora _mm_storeu_si128((__m128i*)dst, tmp2); 142af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 4 * 4; 143af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 144af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 145af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 146af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, 147af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst) { 148af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 149af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n < 32; n += 2) { 150af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); 151af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); 152af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); 153af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); 154af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); 155af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); 156af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora _mm_storel_epi64((__m128i*)dst, tmp3); 157af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 4 * 2; 158af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 159af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 160af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 161af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, 162af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst) { 163af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 164af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t tmp0[2 * 3 + 5 + 15]; 165af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align 166af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory 167af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3); 168af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 169af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Last two pixels are special: we write in a tmp buffer before sending 170af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // to dst. 171af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); 172af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); 173af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora memcpy(dst + n * 3, tmp, 2 * 3); 174af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 175af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 176af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, 177af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst) { 178af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 179af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t tmp0[2 * 3 + 5 + 15]; 180af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align 181af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n < 30; ++n) { 182af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3); 183af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 184af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); 185af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); 186af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora memcpy(dst + n * 3, tmp, 2 * 3); 187af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 188af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 189af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif // FANCY_UPSAMPLING 190af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 191af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//----------------------------------------------------------------------------- 192af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Arbitrary-length row conversion functions 193af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 194af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToRgbaRowSSE2(const uint8_t* y, 195af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8_t* u, const uint8_t* v, 196af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst, int len) { 197af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 198af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n + 4 <= len; n += 4) { 199af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i uv_0 = LoadUVPart(u[0], v[0]); 200af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i uv_1 = LoadUVPart(u[1], v[1]); 201af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); 202af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); 203af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1); 204af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1); 205af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); 206af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); 207af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); 208af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora _mm_storeu_si128((__m128i*)dst, tmp2); 209af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 4 * 4; 210af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora y += 4; 211af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora u += 2; 212af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora v += 2; 213af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 214af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Finish off 215af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora while (n < len) { 216af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8YuvToRgba(y[0], u[0], v[0], dst); 217af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 4; 218af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++y; 219af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora u += (n & 1); 220af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora v += (n & 1); 221af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++n; 222af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 223af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 224af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 225af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToBgraRowSSE2(const uint8_t* y, 226af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8_t* u, const uint8_t* v, 227af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst, int len) { 228af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 229af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n + 2 <= len; n += 2) { 230af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i uv_0 = LoadUVPart(u[0], v[0]); 231af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); 232af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); 233af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); 234af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); 235af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); 236af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); 237af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora _mm_storel_epi64((__m128i*)dst, tmp3); 238af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 4 * 2; 239af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora y += 2; 240af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++u; 241af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++v; 242af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 243af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Finish off 244af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (len & 1) { 245af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8YuvToBgra(y[0], u[0], v[0], dst); 246af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 247af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 248af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 249af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToArgbRowSSE2(const uint8_t* y, 250af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8_t* u, const uint8_t* v, 251af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst, int len) { 252af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 253af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n + 2 <= len; n += 2) { 254af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i uv_0 = LoadUVPart(u[0], v[0]); 255af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); 256af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); 257af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3)); 258af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3)); 259af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); 260af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); 261af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora _mm_storel_epi64((__m128i*)dst, tmp3); 262af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 4 * 2; 263af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora y += 2; 264af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++u; 265af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++v; 266af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 267af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Finish off 268af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (len & 1) { 269af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8YuvToArgb(y[0], u[0], v[0], dst); 270af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 271af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 272af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 273af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToRgbRowSSE2(const uint8_t* y, 274af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8_t* u, const uint8_t* v, 275af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst, int len) { 276af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 277af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory 278af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToRgbSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes 279af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 3; 280af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++y; 281af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora u += (n & 1); 282af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora v += (n & 1); 283af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 284af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8YuvToRgb(y[0], u[0], v[0], dst); 285af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (len > 1) { 286af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3); 287af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 288af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 289af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 290af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void YuvToBgrRowSSE2(const uint8_t* y, 291af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8_t* u, const uint8_t* v, 292af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8_t* dst, int len) { 293af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int n; 294af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory 295af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora YuvToBgrSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes 296af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst += 3; 297af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora ++y; 298af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora u += (n & 1); 299af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora v += (n & 1); 300af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 301af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8YuvToBgr(y[0], u[0], v[0], dst + 0); 302af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (len > 1) { 303af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3); 304af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 305af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 306af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 307af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif // WEBP_USE_SSE2 308af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 309af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------ 310af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Entry point 311af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 312af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroraextern void WebPInitSamplersSSE2(void); 313af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 314af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroravoid WebPInitSamplersSSE2(void) { 315af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(WEBP_USE_SSE2) 316af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora WebPSamplers[MODE_RGB] = YuvToRgbRowSSE2; 317af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2; 318af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora WebPSamplers[MODE_BGR] = YuvToBgrRowSSE2; 319af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2; 320af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2; 321af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif // WEBP_USE_SSE2 322af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 323