133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Copyright 2014 Google Inc. All Rights Reserved. 233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// 333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Use of this source code is governed by a BSD-style license 433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// that can be found in the COPYING file in the root of the source 533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// tree. An additional intellectual property rights grant can be found 633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// in the file PATENTS. All contributing project authors may 733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// be found in the AUTHORS file in the root of the source tree. 833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// ----------------------------------------------------------------------------- 933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// 1033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// SSE2 variant of methods for lossless decoder 1133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// 1233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Author: Skal (pascal.massimino@gmail.com) 1333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 1433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./dsp.h" 1533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 1633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(WEBP_USE_SSE2) 17fa39824bb690c5806358871f46940d0450973d8aJames Zern 18fa39824bb690c5806358871f46940d0450973d8aJames Zern#include "./common_sse2.h" 19fa39824bb690c5806358871f46940d0450973d8aJames Zern#include "./lossless.h" 20fa39824bb690c5806358871f46940d0450973d8aJames Zern#include "./lossless_common.h" 217c8da7ce66017295a65ec028084b90800be377f8James Zern#include <assert.h> 2233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <emmintrin.h> 2333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 2433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 2533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Predictor Transform 2633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 2733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, 2833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32_t c2) { 2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i zero = _mm_setzero_si128(); 3033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); 3133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); 3233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); 3333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i V1 = _mm_add_epi16(C0, C1); 3433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i V2 = _mm_sub_epi16(V1, C2); 3533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i b = _mm_packus_epi16(V2, V2); 3633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t output = _mm_cvtsi128_si32(b); 3733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return output; 3833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 3933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 4033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, 4133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32_t c2) { 4233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i zero = _mm_setzero_si128(); 4333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); 4433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); 4533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); 4633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i avg = _mm_add_epi16(C1, C0); 4733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A0 = _mm_srli_epi16(avg, 1); 4833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A1 = _mm_sub_epi16(A0, B0); 4933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i BgtA = _mm_cmpgt_epi16(B0, A0); 5033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A2 = _mm_sub_epi16(A1, BgtA); 5133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A3 = _mm_srai_epi16(A2, 1); 5233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A4 = _mm_add_epi16(A0, A3); 5333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A5 = _mm_packus_epi16(A4, A4); 5433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t output = _mm_cvtsi128_si32(A5); 5533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return output; 5633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 5733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 5833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { 5933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int pa_minus_pb; 6033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i zero = _mm_setzero_si128(); 6133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A0 = _mm_cvtsi32_si128(a); 6233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i B0 = _mm_cvtsi32_si128(b); 6333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i C0 = _mm_cvtsi32_si128(c); 6433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i AC0 = _mm_subs_epu8(A0, C0); 6533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i CA0 = _mm_subs_epu8(C0, A0); 6633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i BC0 = _mm_subs_epu8(B0, C0); 6733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i CB0 = _mm_subs_epu8(C0, B0); 6833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i AC = _mm_or_si128(AC0, CA0); 6933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i BC = _mm_or_si128(BC0, CB0); 7033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c| 7133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c| 7233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i diff = _mm_sub_epi16(pb, pa); 7333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { 7433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16_t out[8]; 7533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storeu_si128((__m128i*)out, diff); 7633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora pa_minus_pb = out[0] + out[1] + out[2] + out[3]; 7733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 7833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return (pa_minus_pb <= 0) ? a : b; 7933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 8033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 81fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic WEBP_INLINE void Average2_m128i(const __m128i* const a0, 82fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i* const a1, 83fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i* const avg) { 84fa39824bb690c5806358871f46940d0450973d8aJames Zern // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) 85fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i ones = _mm_set1_epi8(1); 86fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i avg1 = _mm_avg_epu8(*a0, *a1); 87fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones); 88fa39824bb690c5806358871f46940d0450973d8aJames Zern *avg = _mm_sub_epi8(avg1, one); 89fa39824bb690c5806358871f46940d0450973d8aJames Zern} 90fa39824bb690c5806358871f46940d0450973d8aJames Zern 91fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1, 92fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i* const avg) { 93fa39824bb690c5806358871f46940d0450973d8aJames Zern // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) 94fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i ones = _mm_set1_epi8(1); 95fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A0 = _mm_cvtsi32_si128(a0); 96fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A1 = _mm_cvtsi32_si128(a1); 97fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i avg1 = _mm_avg_epu8(A0, A1); 98fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones); 99fa39824bb690c5806358871f46940d0450973d8aJames Zern *avg = _mm_sub_epi8(avg1, one); 100fa39824bb690c5806358871f46940d0450973d8aJames Zern} 101fa39824bb690c5806358871f46940d0450973d8aJames Zern 102fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) { 10333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i zero = _mm_setzero_si128(); 10433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); 10533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); 10633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i sum = _mm_add_epi16(A1, A0); 107fa39824bb690c5806358871f46940d0450973d8aJames Zern return _mm_srli_epi16(sum, 1); 10833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 10933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 11033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { 111fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i output; 112fa39824bb690c5806358871f46940d0450973d8aJames Zern Average2_uint32(a0, a1, &output); 113fa39824bb690c5806358871f46940d0450973d8aJames Zern return _mm_cvtsi128_si32(output); 11433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 11533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 11633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { 11733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i zero = _mm_setzero_si128(); 118fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i avg1 = Average2_uint32_16(a0, a2); 11933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); 12033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i sum = _mm_add_epi16(avg1, A1); 12133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i avg2 = _mm_srli_epi16(sum, 1); 12233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A2 = _mm_packus_epi16(avg2, avg2); 12333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t output = _mm_cvtsi128_si32(A2); 12433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return output; 12533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 12633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 12733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, 12833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32_t a2, uint32_t a3) { 129fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i avg1 = Average2_uint32_16(a0, a1); 130fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i avg2 = Average2_uint32_16(a2, a3); 13133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i sum = _mm_add_epi16(avg2, avg1); 13233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i avg3 = _mm_srli_epi16(sum, 1); 13333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i A0 = _mm_packus_epi16(avg3, avg3); 13433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t output = _mm_cvtsi128_si32(A0); 13533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return output; 13633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 13733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 138fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) { 13933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = Average3(left, top[0], top[1]); 14033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 14133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 142fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) { 14333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = Average2(left, top[-1]); 14433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 14533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 146fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) { 14733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = Average2(left, top[0]); 14833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 14933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 150fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) { 15133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = Average2(top[-1], top[0]); 15233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora (void)left; 15333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 15433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 155fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) { 15633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = Average2(top[0], top[1]); 15733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora (void)left; 15833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 15933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 160fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) { 16133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = Average4(left, top[-1], top[0], top[1]); 16233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 16333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 164fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) { 16533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = Select(top[0], left, top[-1]); 16633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 16733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 168fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) { 16933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); 17033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 17133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 172fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) { 17333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); 17433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return pred; 17533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 17633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 177fa39824bb690c5806358871f46940d0450973d8aJames Zern// Batch versions of those functions. 178fa39824bb690c5806358871f46940d0450973d8aJames Zern 179fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor0: ARGB_BLACK. 180fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, 181fa39824bb690c5806358871f46940d0450973d8aJames Zern int num_pixels, uint32_t* out) { 182fa39824bb690c5806358871f46940d0450973d8aJames Zern int i; 183fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i black = _mm_set1_epi32(ARGB_BLACK); 184fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + 4 <= num_pixels; i += 4) { 185fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); 186fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i res = _mm_add_epi8(src, black); 187fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)&out[i], res); 188fa39824bb690c5806358871f46940d0450973d8aJames Zern } 189fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { 190fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i); 191fa39824bb690c5806358871f46940d0450973d8aJames Zern } 192fa39824bb690c5806358871f46940d0450973d8aJames Zern} 193fa39824bb690c5806358871f46940d0450973d8aJames Zern 194fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor1: left. 195fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, 196fa39824bb690c5806358871f46940d0450973d8aJames Zern int num_pixels, uint32_t* out) { 197fa39824bb690c5806358871f46940d0450973d8aJames Zern int i; 198fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i prev = _mm_set1_epi32(out[-1]); 199fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + 4 <= num_pixels; i += 4) { 200fa39824bb690c5806358871f46940d0450973d8aJames Zern // a | b | c | d 201fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); 202fa39824bb690c5806358871f46940d0450973d8aJames Zern // 0 | a | b | c 203fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i shift0 = _mm_slli_si128(src, 4); 204fa39824bb690c5806358871f46940d0450973d8aJames Zern // a | a + b | b + c | c + d 205fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i sum0 = _mm_add_epi8(src, shift0); 206fa39824bb690c5806358871f46940d0450973d8aJames Zern // 0 | 0 | a | a + b 207fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i shift1 = _mm_slli_si128(sum0, 8); 208fa39824bb690c5806358871f46940d0450973d8aJames Zern // a | a + b | a + b + c | a + b + c + d 209fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i sum1 = _mm_add_epi8(sum0, shift1); 210fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i res = _mm_add_epi8(sum1, prev); 211fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)&out[i], res); 212fa39824bb690c5806358871f46940d0450973d8aJames Zern // replicate prev output on the four lanes 213fa39824bb690c5806358871f46940d0450973d8aJames Zern prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6)); 214fa39824bb690c5806358871f46940d0450973d8aJames Zern } 215fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { 216fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i); 217fa39824bb690c5806358871f46940d0450973d8aJames Zern } 218fa39824bb690c5806358871f46940d0450973d8aJames Zern} 219fa39824bb690c5806358871f46940d0450973d8aJames Zern 220fa39824bb690c5806358871f46940d0450973d8aJames Zern// Macro that adds 32-bit integers from IN using mod 256 arithmetic 221fa39824bb690c5806358871f46940d0450973d8aJames Zern// per 8 bit channel. 222fa39824bb690c5806358871f46940d0450973d8aJames Zern#define GENERATE_PREDICTOR_1(X, IN) \ 223fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ 224fa39824bb690c5806358871f46940d0450973d8aJames Zern int num_pixels, uint32_t* out) { \ 225fa39824bb690c5806358871f46940d0450973d8aJames Zern int i; \ 226fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + 4 <= num_pixels; i += 4) { \ 227fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ 228fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \ 229fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i res = _mm_add_epi8(src, other); \ 230fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)&out[i], res); \ 231fa39824bb690c5806358871f46940d0450973d8aJames Zern } \ 232fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { \ 233fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ 234fa39824bb690c5806358871f46940d0450973d8aJames Zern } \ 235fa39824bb690c5806358871f46940d0450973d8aJames Zern} 236fa39824bb690c5806358871f46940d0450973d8aJames Zern 237fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor2: Top. 238fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_1(2, upper[i]) 239fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor3: Top-right. 240fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_1(3, upper[i + 1]) 241fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor4: Top-left. 242fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_1(4, upper[i - 1]) 243fa39824bb690c5806358871f46940d0450973d8aJames Zern#undef GENERATE_PREDICTOR_1 244fa39824bb690c5806358871f46940d0450973d8aJames Zern 245fa39824bb690c5806358871f46940d0450973d8aJames Zern// Due to averages with integers, values cannot be accumulated in parallel for 246fa39824bb690c5806358871f46940d0450973d8aJames Zern// predictors 5 to 7. 247fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2) 248fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2) 249fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2) 250fa39824bb690c5806358871f46940d0450973d8aJames Zern 251fa39824bb690c5806358871f46940d0450973d8aJames Zern#define GENERATE_PREDICTOR_2(X, IN) \ 252fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ 253fa39824bb690c5806358871f46940d0450973d8aJames Zern int num_pixels, uint32_t* out) { \ 254fa39824bb690c5806358871f46940d0450973d8aJames Zern int i; \ 255fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + 4 <= num_pixels; i += 4) { \ 256fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \ 257fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \ 258fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ 259fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i avg, res; \ 260fa39824bb690c5806358871f46940d0450973d8aJames Zern Average2_m128i(&T, &Tother, &avg); \ 261fa39824bb690c5806358871f46940d0450973d8aJames Zern res = _mm_add_epi8(avg, src); \ 262fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)&out[i], res); \ 263fa39824bb690c5806358871f46940d0450973d8aJames Zern } \ 264fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { \ 265fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ 266fa39824bb690c5806358871f46940d0450973d8aJames Zern } \ 267fa39824bb690c5806358871f46940d0450973d8aJames Zern} 268fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor8: average TL T. 269fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_2(8, upper[i - 1]) 270fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor9: average T TR. 271fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_2(9, upper[i + 1]) 272fa39824bb690c5806358871f46940d0450973d8aJames Zern#undef GENERATE_PREDICTOR_2 273fa39824bb690c5806358871f46940d0450973d8aJames Zern 274fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor10: average of (average of (L,TL), average of (T, TR)). 275fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, 276fa39824bb690c5806358871f46940d0450973d8aJames Zern int num_pixels, uint32_t* out) { 277fa39824bb690c5806358871f46940d0450973d8aJames Zern int i, j; 278fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i L = _mm_cvtsi32_si128(out[-1]); 279fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + 4 <= num_pixels; i += 4) { 280fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); 281fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); 282fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); 283fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); 284fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i avgTTR; 285fa39824bb690c5806358871f46940d0450973d8aJames Zern Average2_m128i(&T, &TR, &avgTTR); 286fa39824bb690c5806358871f46940d0450973d8aJames Zern for (j = 0; j < 4; ++j) { 287fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i avgLTL, avg; 288fa39824bb690c5806358871f46940d0450973d8aJames Zern Average2_m128i(&L, &TL, &avgLTL); 289fa39824bb690c5806358871f46940d0450973d8aJames Zern Average2_m128i(&avgTTR, &avgLTL, &avg); 290fa39824bb690c5806358871f46940d0450973d8aJames Zern L = _mm_add_epi8(avg, src); 291fa39824bb690c5806358871f46940d0450973d8aJames Zern out[i + j] = _mm_cvtsi128_si32(L); 292fa39824bb690c5806358871f46940d0450973d8aJames Zern // Rotate the pre-computed values for the next iteration. 293fa39824bb690c5806358871f46940d0450973d8aJames Zern avgTTR = _mm_srli_si128(avgTTR, 4); 294fa39824bb690c5806358871f46940d0450973d8aJames Zern TL = _mm_srli_si128(TL, 4); 295fa39824bb690c5806358871f46940d0450973d8aJames Zern src = _mm_srli_si128(src, 4); 296fa39824bb690c5806358871f46940d0450973d8aJames Zern } 297fa39824bb690c5806358871f46940d0450973d8aJames Zern } 298fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { 299fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); 300fa39824bb690c5806358871f46940d0450973d8aJames Zern } 301fa39824bb690c5806358871f46940d0450973d8aJames Zern} 302fa39824bb690c5806358871f46940d0450973d8aJames Zern 303fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor11: select. 304fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B, 305fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i* const out) { 306fa39824bb690c5806358871f46940d0450973d8aJames Zern // We can unpack with any value on the upper 32 bits, provided it's the same 307fa39824bb690c5806358871f46940d0450973d8aJames Zern // on both operands (to that their sum of abs diff is zero). Here we use *A. 308fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A_lo = _mm_unpacklo_epi32(*A, *A); 309fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i B_lo = _mm_unpacklo_epi32(*B, *A); 310fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A_hi = _mm_unpackhi_epi32(*A, *A); 311fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i B_hi = _mm_unpackhi_epi32(*B, *A); 312fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo); 313fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi); 314fa39824bb690c5806358871f46940d0450973d8aJames Zern *out = _mm_packs_epi32(s_lo, s_hi); 315fa39824bb690c5806358871f46940d0450973d8aJames Zern} 316fa39824bb690c5806358871f46940d0450973d8aJames Zern 317fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, 318fa39824bb690c5806358871f46940d0450973d8aJames Zern int num_pixels, uint32_t* out) { 319fa39824bb690c5806358871f46940d0450973d8aJames Zern int i, j; 320fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i L = _mm_cvtsi32_si128(out[-1]); 321fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + 4 <= num_pixels; i += 4) { 322fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); 323fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); 324fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); 325fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i pa; 326fa39824bb690c5806358871f46940d0450973d8aJames Zern GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL| 327fa39824bb690c5806358871f46940d0450973d8aJames Zern for (j = 0; j < 4; ++j) { 328fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i L_lo = _mm_unpacklo_epi32(L, L); 329fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i TL_lo = _mm_unpacklo_epi32(TL, L); 330fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL| 331fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i mask = _mm_cmpgt_epi32(pb, pa); 332fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A = _mm_and_si128(mask, L); 333fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i B = _mm_andnot_si128(mask, T); 334fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T 335fa39824bb690c5806358871f46940d0450973d8aJames Zern L = _mm_add_epi8(src, pred); 336fa39824bb690c5806358871f46940d0450973d8aJames Zern out[i + j] = _mm_cvtsi128_si32(L); 337fa39824bb690c5806358871f46940d0450973d8aJames Zern // Shift the pre-computed value for the next iteration. 338fa39824bb690c5806358871f46940d0450973d8aJames Zern T = _mm_srli_si128(T, 4); 339fa39824bb690c5806358871f46940d0450973d8aJames Zern TL = _mm_srli_si128(TL, 4); 340fa39824bb690c5806358871f46940d0450973d8aJames Zern src = _mm_srli_si128(src, 4); 341fa39824bb690c5806358871f46940d0450973d8aJames Zern pa = _mm_srli_si128(pa, 4); 342fa39824bb690c5806358871f46940d0450973d8aJames Zern } 343fa39824bb690c5806358871f46940d0450973d8aJames Zern } 344fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { 345fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); 346fa39824bb690c5806358871f46940d0450973d8aJames Zern } 347fa39824bb690c5806358871f46940d0450973d8aJames Zern} 348fa39824bb690c5806358871f46940d0450973d8aJames Zern 349fa39824bb690c5806358871f46940d0450973d8aJames Zern// Predictor12: ClampedAddSubtractFull. 350fa39824bb690c5806358871f46940d0450973d8aJames Zern#define DO_PRED12(DIFF, LANE, OUT) \ 351fa39824bb690c5806358871f46940d0450973d8aJames Zerndo { \ 352fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i all = _mm_add_epi16(L, (DIFF)); \ 353fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i alls = _mm_packus_epi16(all, all); \ 354fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i res = _mm_add_epi8(src, alls); \ 355fa39824bb690c5806358871f46940d0450973d8aJames Zern out[i + (OUT)] = _mm_cvtsi128_si32(res); \ 356fa39824bb690c5806358871f46940d0450973d8aJames Zern L = _mm_unpacklo_epi8(res, zero); \ 357fa39824bb690c5806358871f46940d0450973d8aJames Zern /* Shift the pre-computed value for the next iteration.*/ \ 358fa39824bb690c5806358871f46940d0450973d8aJames Zern if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \ 359fa39824bb690c5806358871f46940d0450973d8aJames Zern src = _mm_srli_si128(src, 4); \ 360fa39824bb690c5806358871f46940d0450973d8aJames Zern} while (0) 361fa39824bb690c5806358871f46940d0450973d8aJames Zern 362fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, 363fa39824bb690c5806358871f46940d0450973d8aJames Zern int num_pixels, uint32_t* out) { 364fa39824bb690c5806358871f46940d0450973d8aJames Zern int i; 365fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i zero = _mm_setzero_si128(); 366fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i L8 = _mm_cvtsi32_si128(out[-1]); 367fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i L = _mm_unpacklo_epi8(L8, zero); 368fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + 4 <= num_pixels; i += 4) { 369fa39824bb690c5806358871f46940d0450973d8aJames Zern // Load 4 pixels at a time. 370fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); 371fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); 372fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i T_lo = _mm_unpacklo_epi8(T, zero); 373fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i T_hi = _mm_unpackhi_epi8(T, zero); 374fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); 375fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); 376fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero); 377fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo); 378fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi); 379fa39824bb690c5806358871f46940d0450973d8aJames Zern DO_PRED12(diff_lo, 0, 0); 380fa39824bb690c5806358871f46940d0450973d8aJames Zern DO_PRED12(diff_lo, 1, 1); 381fa39824bb690c5806358871f46940d0450973d8aJames Zern DO_PRED12(diff_hi, 0, 2); 382fa39824bb690c5806358871f46940d0450973d8aJames Zern DO_PRED12(diff_hi, 1, 3); 383fa39824bb690c5806358871f46940d0450973d8aJames Zern } 384fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { 385fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i); 386fa39824bb690c5806358871f46940d0450973d8aJames Zern } 387fa39824bb690c5806358871f46940d0450973d8aJames Zern} 388fa39824bb690c5806358871f46940d0450973d8aJames Zern#undef DO_PRED12 389fa39824bb690c5806358871f46940d0450973d8aJames Zern 390fa39824bb690c5806358871f46940d0450973d8aJames Zern// Due to averages with integers, values cannot be accumulated in parallel for 391fa39824bb690c5806358871f46940d0450973d8aJames Zern// predictors 13. 392fa39824bb690c5806358871f46940d0450973d8aJames ZernGENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2) 393fa39824bb690c5806358871f46940d0450973d8aJames Zern 39433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 39533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Subtract-Green Transform 39633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 397fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, 398fa39824bb690c5806358871f46940d0450973d8aJames Zern uint32_t* dst) { 39933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int i; 40033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (i = 0; i + 4 <= num_pixels; i += 4) { 401fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb 4027c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g 4037c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); 4047c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g 4057c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out = _mm_add_epi8(in, C); 406fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)&dst[i], out); 40733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 40833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // fallthrough and finish off with plain-C 409fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { 410fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i); 411fa39824bb690c5806358871f46940d0450973d8aJames Zern } 41233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 41333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 41433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 41533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Color Transform 41633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 4177c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void TransformColorInverse(const VP8LMultipliers* const m, 418fa39824bb690c5806358871f46940d0450973d8aJames Zern const uint32_t* const src, int num_pixels, 419fa39824bb690c5806358871f46940d0450973d8aJames Zern uint32_t* dst) { 420fa39824bb690c5806358871f46940d0450973d8aJames Zern// sign-extended multiplying constants, pre-shifted by 5. 4217c8da7ce66017295a65ec028084b90800be377f8James Zern#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend 4227c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mults_rb = _mm_set_epi16( 4237c8da7ce66017295a65ec028084b90800be377f8James Zern CST(green_to_red_), CST(green_to_blue_), 4247c8da7ce66017295a65ec028084b90800be377f8James Zern CST(green_to_red_), CST(green_to_blue_), 4257c8da7ce66017295a65ec028084b90800be377f8James Zern CST(green_to_red_), CST(green_to_blue_), 4267c8da7ce66017295a65ec028084b90800be377f8James Zern CST(green_to_red_), CST(green_to_blue_)); 4277c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mults_b2 = _mm_set_epi16( 4287c8da7ce66017295a65ec028084b90800be377f8James Zern CST(red_to_blue_), 0, CST(red_to_blue_), 0, 4297c8da7ce66017295a65ec028084b90800be377f8James Zern CST(red_to_blue_), 0, CST(red_to_blue_), 0); 4307c8da7ce66017295a65ec028084b90800be377f8James Zern#undef CST 4317c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks 43233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int i; 43333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (i = 0; i + 4 <= num_pixels; i += 4) { 434fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb 4357c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 4367c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); 4377c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 4387c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 4397c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i E = _mm_add_epi8(in, D); // x r' x b' 4407c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0 4417c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0 4427c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0 4437c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0 4447c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b'' 4457c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out = _mm_or_si128(J, A); 446fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)&dst[i], out); 44733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 44833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Fall-back to C-version for left-overs. 449fa39824bb690c5806358871f46940d0450973d8aJames Zern if (i != num_pixels) { 450fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); 451fa39824bb690c5806358871f46940d0450973d8aJames Zern } 45233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 45333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 45433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 45533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Color-space conversion functions 45633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 457fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, 458fa39824bb690c5806358871f46940d0450973d8aJames Zern uint8_t* dst) { 459fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i* in = (const __m128i*)src; 460fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i* out = (__m128i*)dst; 461fa39824bb690c5806358871f46940d0450973d8aJames Zern 462fa39824bb690c5806358871f46940d0450973d8aJames Zern while (num_pixels >= 32) { 463fa39824bb690c5806358871f46940d0450973d8aJames Zern // Load the BGRA buffers. 464fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in0 = _mm_loadu_si128(in + 0); 465fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in1 = _mm_loadu_si128(in + 1); 466fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in2 = _mm_loadu_si128(in + 2); 467fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in3 = _mm_loadu_si128(in + 3); 468fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in4 = _mm_loadu_si128(in + 4); 469fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in5 = _mm_loadu_si128(in + 5); 470fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in6 = _mm_loadu_si128(in + 6); 471fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i in7 = _mm_loadu_si128(in + 7); 472fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8L32bToPlanar(&in0, &in1, &in2, &in3); 473fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8L32bToPlanar(&in4, &in5, &in6, &in7); 474fa39824bb690c5806358871f46940d0450973d8aJames Zern // At this points, in1/in5 contains red only, in2/in6 green only ... 475fa39824bb690c5806358871f46940d0450973d8aJames Zern // Pack the colors in 24b RGB. 476fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7); 477fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128(out + 0, in1); 478fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128(out + 1, in5); 479fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128(out + 2, in2); 480fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128(out + 3, in6); 481fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128(out + 4, in3); 482fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128(out + 5, in7); 483fa39824bb690c5806358871f46940d0450973d8aJames Zern in += 8; 484fa39824bb690c5806358871f46940d0450973d8aJames Zern out += 6; 485fa39824bb690c5806358871f46940d0450973d8aJames Zern num_pixels -= 32; 486fa39824bb690c5806358871f46940d0450973d8aJames Zern } 487fa39824bb690c5806358871f46940d0450973d8aJames Zern // left-overs 488fa39824bb690c5806358871f46940d0450973d8aJames Zern if (num_pixels > 0) { 489fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); 490fa39824bb690c5806358871f46940d0450973d8aJames Zern } 491fa39824bb690c5806358871f46940d0450973d8aJames Zern} 492fa39824bb690c5806358871f46940d0450973d8aJames Zern 49333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ConvertBGRAToRGBA(const uint32_t* src, 49433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int num_pixels, uint8_t* dst) { 49533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i* in = (const __m128i*)src; 49633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora __m128i* out = (__m128i*)dst; 49733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora while (num_pixels >= 8) { 49833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 49933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 50033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... 50133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... 50233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... 50333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... 50433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 50533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 50633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 50733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 50833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7 50933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7 51033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1... 51133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5... 51233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storeu_si128(out++, rgba0); 51333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storeu_si128(out++, rgba4); 51433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora num_pixels -= 8; 51533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 51633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // left-overs 517fa39824bb690c5806358871f46940d0450973d8aJames Zern if (num_pixels > 0) { 518fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out); 519fa39824bb690c5806358871f46940d0450973d8aJames Zern } 52033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 52133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 52233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ConvertBGRAToRGBA4444(const uint32_t* src, 52333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int num_pixels, uint8_t* dst) { 52433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i mask_0x0f = _mm_set1_epi8(0x0f); 52533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); 52633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i* in = (const __m128i*)src; 52733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora __m128i* out = (__m128i*)dst; 52833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora while (num_pixels >= 8) { 52933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 53033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 53133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... 53233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... 53333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... 53433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... 53533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 53633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 53733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 53833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 53933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i ga1 = _mm_srli_epi16(ga0, 4); // g0-|g1-|...|a6-|a7- 54033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0); // -r0|-r1|...|-b6|-a7 54133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7- 54233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7 54333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0 54433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#ifdef WEBP_SWAP_16BIT_CSP 54533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7 54633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else 54733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7 54833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif 54933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storeu_si128(out++, rgba); 55033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora num_pixels -= 8; 55133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 55233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // left-overs 553fa39824bb690c5806358871f46940d0450973d8aJames Zern if (num_pixels > 0) { 554fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out); 555fa39824bb690c5806358871f46940d0450973d8aJames Zern } 55633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 55733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 55833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ConvertBGRAToRGB565(const uint32_t* src, 55933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int num_pixels, uint8_t* dst) { 56033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); 56133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); 56233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i mask_0x07 = _mm_set1_epi8(0x07); 56333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i* in = (const __m128i*)src; 56433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora __m128i* out = (__m128i*)dst; 56533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora while (num_pixels >= 8) { 56633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 56733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 56833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... 56933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... 57033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... 57133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... 57233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 57333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 57433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 57533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 57633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7 57733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i g_lo1 = _mm_srli_epi16(ga0, 5); 57833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b) 57933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i g_hi1 = _mm_slli_epi16(ga0, 3); 58033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b) 58133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0 58233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx 58333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i b1 = _mm_srli_epi16(b0, 3); 58433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx 58533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#ifdef WEBP_SWAP_16BIT_CSP 58633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7 58733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else 58833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7 58933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif 59033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storeu_si128(out++, rgba); 59133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora num_pixels -= 8; 59233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 59333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // left-overs 594fa39824bb690c5806358871f46940d0450973d8aJames Zern if (num_pixels > 0) { 595fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out); 596fa39824bb690c5806358871f46940d0450973d8aJames Zern } 59733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 59833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 59933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ConvertBGRAToBGR(const uint32_t* src, 60033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int num_pixels, uint8_t* dst) { 60133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff); 60233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0); 60333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i* in = (const __m128i*)src; 60433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* const end = dst + num_pixels * 3; 60533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // the last storel_epi64 below writes 8 bytes starting at offset 18 60633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora while (dst + 26 <= end) { 60733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 60833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 60933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i a0l = _mm_and_si128(bgra0, mask_l); // bgr0|0|bgr0|0 61033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i a4l = _mm_and_si128(bgra4, mask_l); // bgr0|0|bgr0|0 61133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i a0h = _mm_and_si128(bgra0, mask_h); // 0|bgr0|0|bgr0 61233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i a4h = _mm_and_si128(bgra4, mask_h); // 0|bgr0|0|bgr0 61333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i b0h = _mm_srli_epi64(a0h, 8); // 000b|gr00|000b|gr00 61433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i b4h = _mm_srli_epi64(a4h, 8); // 000b|gr00|000b|gr00 61533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i c0 = _mm_or_si128(a0l, b0h); // rgbrgb00|rgbrgb00 61633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i c4 = _mm_or_si128(a4l, b4h); // rgbrgb00|rgbrgb00 61733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i c2 = _mm_srli_si128(c0, 8); 61833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i c6 = _mm_srli_si128(c4, 8); 61933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)(dst + 0), c0); 62033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)(dst + 6), c2); 62133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)(dst + 12), c4); 62233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora _mm_storel_epi64((__m128i*)(dst + 18), c6); 62333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst += 24; 62433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora num_pixels -= 8; 62533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 62633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // left-overs 627fa39824bb690c5806358871f46940d0450973d8aJames Zern if (num_pixels > 0) { 628fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst); 629fa39824bb690c5806358871f46940d0450973d8aJames Zern } 63033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 63133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 63233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 6337c8da7ce66017295a65ec028084b90800be377f8James Zern// Entry point 63433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 63533f74dabbc7920a65ed435d7417987589febdc16Vikas Aroraextern void VP8LDspInitSSE2(void); 63633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 6377c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) { 638fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[5] = Predictor5_SSE2; 639fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[6] = Predictor6_SSE2; 640fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[7] = Predictor7_SSE2; 641fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[8] = Predictor8_SSE2; 642fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[9] = Predictor9_SSE2; 643fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[10] = Predictor10_SSE2; 644fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[11] = Predictor11_SSE2; 645fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[12] = Predictor12_SSE2; 646fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictors[13] = Predictor13_SSE2; 647fa39824bb690c5806358871f46940d0450973d8aJames Zern 648fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[0] = PredictorAdd0_SSE2; 649fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[1] = PredictorAdd1_SSE2; 650fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[2] = PredictorAdd2_SSE2; 651fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[3] = PredictorAdd3_SSE2; 652fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[4] = PredictorAdd4_SSE2; 653fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[5] = PredictorAdd5_SSE2; 654fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[6] = PredictorAdd6_SSE2; 655fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[7] = PredictorAdd7_SSE2; 656fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[8] = PredictorAdd8_SSE2; 657fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[9] = PredictorAdd9_SSE2; 658fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[10] = PredictorAdd10_SSE2; 659fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[11] = PredictorAdd11_SSE2; 660fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[12] = PredictorAdd12_SSE2; 661fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LPredictorsAdd[13] = PredictorAdd13_SSE2; 66233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 66333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; 66433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8LTransformColorInverse = TransformColorInverse; 66533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 666fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8LConvertBGRAToRGB = ConvertBGRAToRGB; 66733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; 66833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444; 66933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565; 67033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8LConvertBGRAToBGR = ConvertBGRAToBGR; 67133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 67233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 6737c8da7ce66017295a65ec028084b90800be377f8James Zern#else // !WEBP_USE_SSE2 6747c8da7ce66017295a65ec028084b90800be377f8James Zern 6757c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(VP8LDspInitSSE2) 6767c8da7ce66017295a65ec028084b90800be377f8James Zern 6777c8da7ce66017295a65ec028084b90800be377f8James Zern#endif // WEBP_USE_SSE2 678