// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
118c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora// 128c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora// Author: Skal (pascal.massimino@gmail.com) 138c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 148c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#include "./dsp.h" 158c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 168c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#if defined(WEBP_USE_SSE2) 178c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#include <emmintrin.h> 188c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 198c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora//------------------------------------------------------------------------------ 208c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 217c8da7ce66017295a65ec028084b90800be377f8James Zernstatic int DispatchAlpha(const uint8_t* alpha, int alpha_stride, 227c8da7ce66017295a65ec028084b90800be377f8James Zern int width, int height, 237c8da7ce66017295a65ec028084b90800be377f8James Zern uint8_t* dst, int dst_stride) { 247c8da7ce66017295a65ec028084b90800be377f8James Zern // alpha_and stores an 'and' operation of all the alpha[] values. The final 257c8da7ce66017295a65ec028084b90800be377f8James Zern // value is not 0xff if any of the alpha[] is not equal to 0xff. 
267c8da7ce66017295a65ec028084b90800be377f8James Zern uint32_t alpha_and = 0xff; 277c8da7ce66017295a65ec028084b90800be377f8James Zern int i, j; 287c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 297c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB 307c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); 317c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i all_alphas = all_0xff; 327c8da7ce66017295a65ec028084b90800be377f8James Zern 337c8da7ce66017295a65ec028084b90800be377f8James Zern // We must be able to access 3 extra bytes after the last written byte 347c8da7ce66017295a65ec028084b90800be377f8James Zern // 'dst[4 * width - 4]', because we don't know if alpha is the first or the 357c8da7ce66017295a65ec028084b90800be377f8James Zern // last byte of the quadruplet. 367c8da7ce66017295a65ec028084b90800be377f8James Zern const int limit = (width - 1) & ~7; 377c8da7ce66017295a65ec028084b90800be377f8James Zern 387c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < height; ++j) { 397c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i* out = (__m128i*)dst; 407c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < limit; i += 8) { 417c8da7ce66017295a65ec028084b90800be377f8James Zern // load 8 alpha bytes 427c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]); 437c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a1 = _mm_unpacklo_epi8(a0, zero); 447c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); 457c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); 467c8da7ce66017295a65ec028084b90800be377f8James Zern // load 8 dst pixels (32 bytes) 477c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i 
b0_lo = _mm_loadu_si128(out + 0); 487c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b0_hi = _mm_loadu_si128(out + 1); 497c8da7ce66017295a65ec028084b90800be377f8James Zern // mask dst alpha values 507c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask); 517c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask); 527c8da7ce66017295a65ec028084b90800be377f8James Zern // combine 537c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo); 547c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi); 557c8da7ce66017295a65ec028084b90800be377f8James Zern // store 567c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128(out + 0, b2_lo); 577c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128(out + 1, b2_hi); 587c8da7ce66017295a65ec028084b90800be377f8James Zern // accumulate eight alpha 'and' in parallel 597c8da7ce66017295a65ec028084b90800be377f8James Zern all_alphas = _mm_and_si128(all_alphas, a0); 607c8da7ce66017295a65ec028084b90800be377f8James Zern out += 2; 617c8da7ce66017295a65ec028084b90800be377f8James Zern } 627c8da7ce66017295a65ec028084b90800be377f8James Zern for (; i < width; ++i) { 637c8da7ce66017295a65ec028084b90800be377f8James Zern const uint32_t alpha_value = alpha[i]; 647c8da7ce66017295a65ec028084b90800be377f8James Zern dst[4 * i] = alpha_value; 657c8da7ce66017295a65ec028084b90800be377f8James Zern alpha_and &= alpha_value; 667c8da7ce66017295a65ec028084b90800be377f8James Zern } 677c8da7ce66017295a65ec028084b90800be377f8James Zern alpha += alpha_stride; 687c8da7ce66017295a65ec028084b90800be377f8James Zern dst += dst_stride; 697c8da7ce66017295a65ec028084b90800be377f8James Zern } 707c8da7ce66017295a65ec028084b90800be377f8James Zern // Combine the eight alpha 'and' into a 8-bit mask. 
717c8da7ce66017295a65ec028084b90800be377f8James Zern alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); 727c8da7ce66017295a65ec028084b90800be377f8James Zern return (alpha_and != 0xff); 737c8da7ce66017295a65ec028084b90800be377f8James Zern} 747c8da7ce66017295a65ec028084b90800be377f8James Zern 757c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride, 767c8da7ce66017295a65ec028084b90800be377f8James Zern int width, int height, 777c8da7ce66017295a65ec028084b90800be377f8James Zern uint32_t* dst, int dst_stride) { 787c8da7ce66017295a65ec028084b90800be377f8James Zern int i, j; 797c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 807c8da7ce66017295a65ec028084b90800be377f8James Zern const int limit = width & ~15; 817c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < height; ++j) { 827c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < limit; i += 16) { // process 16 alpha bytes 837c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]); 847c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // note the 'zero' first! 
857c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b1 = _mm_unpackhi_epi8(zero, a0); 867c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); 877c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero); 887c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); 897c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero); 907c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo); 917c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi); 927c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo); 937c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi); 947c8da7ce66017295a65ec028084b90800be377f8James Zern } 957c8da7ce66017295a65ec028084b90800be377f8James Zern for (; i < width; ++i) dst[i] = alpha[i] << 8; 967c8da7ce66017295a65ec028084b90800be377f8James Zern alpha += alpha_stride; 977c8da7ce66017295a65ec028084b90800be377f8James Zern dst += dst_stride; 987c8da7ce66017295a65ec028084b90800be377f8James Zern } 997c8da7ce66017295a65ec028084b90800be377f8James Zern} 1007c8da7ce66017295a65ec028084b90800be377f8James Zern 1018c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arorastatic int ExtractAlpha(const uint8_t* argb, int argb_stride, 1028c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora int width, int height, 1038c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint8_t* alpha, int alpha_stride) { 1048c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // alpha_and stores an 'and' operation of all the alpha[] values. The final 1058c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // value is not 0xff if any of the alpha[] is not equal to 0xff. 
1068c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint32_t alpha_and = 0xff; 1078c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora int i, j; 1088c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha 1098c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); 1108c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora __m128i all_alphas = all_0xff; 1118c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 1128c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // We must be able to access 3 extra bytes after the last written byte 1138c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // 'src[4 * width - 4]', because we don't know if alpha is the first or the 1148c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // last byte of the quadruplet. 1158c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const int limit = (width - 1) & ~7; 1168c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 1178c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora for (j = 0; j < height; ++j) { 1188c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i* src = (const __m128i*)argb; 1198c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora for (i = 0; i < limit; i += 8) { 1208c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // load 32 argb bytes 1218c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i a0 = _mm_loadu_si128(src + 0); 1228c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i a1 = _mm_loadu_si128(src + 1); 1238c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i b0 = _mm_and_si128(a0, a_mask); 1248c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i b1 = _mm_and_si128(a1, a_mask); 1258c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i c0 = _mm_packs_epi32(b0, b1); 1268c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const __m128i d0 = _mm_packus_epi16(c0, c0); 
1278c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // store 1288c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora _mm_storel_epi64((__m128i*)&alpha[i], d0); 1298c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // accumulate eight alpha 'and' in parallel 1308c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora all_alphas = _mm_and_si128(all_alphas, d0); 1318c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora src += 2; 1328c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora } 1338c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora for (; i < width; ++i) { 1348c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora const uint32_t alpha_value = argb[4 * i]; 1358c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora alpha[i] = alpha_value; 1368c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora alpha_and &= alpha_value; 1378c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora } 1388c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora argb += argb_stride; 1398c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora alpha += alpha_stride; 1408c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora } 1418c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora // Combine the eight alpha 'and' into a 8-bit mask. 
1428c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); 1438c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora return (alpha_and == 0xff); 1448c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora} 1458c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 1467c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 1477c8da7ce66017295a65ec028084b90800be377f8James Zern// Non-dither premultiplied modes 1487c8da7ce66017295a65ec028084b90800be377f8James Zern 1497c8da7ce66017295a65ec028084b90800be377f8James Zern#define MULTIPLIER(a) ((a) * 0x8081) 1507c8da7ce66017295a65ec028084b90800be377f8James Zern#define PREMULTIPLY(x, m) (((x) * (m)) >> 23) 1517c8da7ce66017295a65ec028084b90800be377f8James Zern 1527c8da7ce66017295a65ec028084b90800be377f8James Zern// We can't use a 'const int' for the SHUFFLE value, because it has to be an 153fa39824bb690c5806358871f46940d0450973d8aJames Zern// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro. 154fa39824bb690c5806358871f46940d0450973d8aJames Zern// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit 155fa39824bb690c5806358871f46940d0450973d8aJames Zern// value. 
156fa39824bb690c5806358871f46940d0450973d8aJames Zern#define APPLY_ALPHA(RGBX, SHUFFLE) do { \ 157fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \ 158fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \ 159fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \ 160fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \ 161fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \ 162fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \ 163fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \ 164fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \ 165fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \ 166fa39824bb690c5806358871f46940d0450973d8aJames Zern /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \ 167fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \ 168fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \ 169fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \ 170fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \ 171fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \ 172fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \ 173fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A3 = 
_mm_packus_epi16(A2_lo, A2_hi); \ 174fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)&(RGBX), A3); \ 1757c8da7ce66017295a65ec028084b90800be377f8James Zern} while (0) 1767c8da7ce66017295a65ec028084b90800be377f8James Zern 177fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, 178fa39824bb690c5806358871f46940d0450973d8aJames Zern int w, int h, int stride) { 1797c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 180fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i kMult = _mm_set1_epi16(0x8081u); 181fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0); 182fa39824bb690c5806358871f46940d0450973d8aJames Zern const int kSpan = 4; 1837c8da7ce66017295a65ec028084b90800be377f8James Zern while (h-- > 0) { 1847c8da7ce66017295a65ec028084b90800be377f8James Zern uint32_t* const rgbx = (uint32_t*)rgba; 1857c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 1867c8da7ce66017295a65ec028084b90800be377f8James Zern if (!alpha_first) { 187fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + kSpan <= w; i += kSpan) { 188fa39824bb690c5806358871f46940d0450973d8aJames Zern APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3)); 1897c8da7ce66017295a65ec028084b90800be377f8James Zern } 1907c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 191fa39824bb690c5806358871f46940d0450973d8aJames Zern for (i = 0; i + kSpan <= w; i += kSpan) { 192fa39824bb690c5806358871f46940d0450973d8aJames Zern APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1)); 1937c8da7ce66017295a65ec028084b90800be377f8James Zern } 1947c8da7ce66017295a65ec028084b90800be377f8James Zern } 1957c8da7ce66017295a65ec028084b90800be377f8James Zern // Finish with left-overs. 
1967c8da7ce66017295a65ec028084b90800be377f8James Zern for (; i < w; ++i) { 1977c8da7ce66017295a65ec028084b90800be377f8James Zern uint8_t* const rgb = rgba + (alpha_first ? 1 : 0); 1987c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3); 1997c8da7ce66017295a65ec028084b90800be377f8James Zern const uint32_t a = alpha[4 * i]; 2007c8da7ce66017295a65ec028084b90800be377f8James Zern if (a != 0xff) { 2017c8da7ce66017295a65ec028084b90800be377f8James Zern const uint32_t mult = MULTIPLIER(a); 2027c8da7ce66017295a65ec028084b90800be377f8James Zern rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult); 2037c8da7ce66017295a65ec028084b90800be377f8James Zern rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult); 2047c8da7ce66017295a65ec028084b90800be377f8James Zern rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult); 2057c8da7ce66017295a65ec028084b90800be377f8James Zern } 2067c8da7ce66017295a65ec028084b90800be377f8James Zern } 2077c8da7ce66017295a65ec028084b90800be377f8James Zern rgba += stride; 2087c8da7ce66017295a65ec028084b90800be377f8James Zern } 2097c8da7ce66017295a65ec028084b90800be377f8James Zern} 2107c8da7ce66017295a65ec028084b90800be377f8James Zern#undef MULTIPLIER 2117c8da7ce66017295a65ec028084b90800be377f8James Zern#undef PREMULTIPLY 2127c8da7ce66017295a65ec028084b90800be377f8James Zern 2137c8da7ce66017295a65ec028084b90800be377f8James Zern// ----------------------------------------------------------------------------- 2147c8da7ce66017295a65ec028084b90800be377f8James Zern// Apply alpha value to rows 2157c8da7ce66017295a65ec028084b90800be377f8James Zern 216fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) { 2177c8da7ce66017295a65ec028084b90800be377f8James Zern int x = 0; 2187c8da7ce66017295a65ec028084b90800be377f8James Zern if (!inverse) { 2197c8da7ce66017295a65ec028084b90800be377f8James Zern const int kSpan = 2; 
2207c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 221fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i k128 = _mm_set1_epi16(128); 222fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i kMult = _mm_set1_epi16(0x0101); 223fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0); 224fa39824bb690c5806358871f46940d0450973d8aJames Zern for (x = 0; x + kSpan <= width; x += kSpan) { 225fa39824bb690c5806358871f46940d0450973d8aJames Zern // To compute 'result = (int)(a * x / 255. + .5)', we use: 226fa39824bb690c5806358871f46940d0450973d8aJames Zern // tmp = a * v + 128, result = (tmp * 0x0101u) >> 16 227fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]); 228fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A1 = _mm_unpacklo_epi8(A0, zero); 229fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A2 = _mm_or_si128(A1, kMask); 230fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3)); 231fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3)); 232fa39824bb690c5806358871f46940d0450973d8aJames Zern // here, A4 = [ff a0 a0 a0][ff a1 a1 a1] 233fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A5 = _mm_mullo_epi16(A4, A1); 234fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A6 = _mm_add_epi16(A5, k128); 235fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A7 = _mm_mulhi_epu16(A6, kMult); 236fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i A10 = _mm_packus_epi16(A7, zero); 237fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storel_epi64((__m128i*)&ptr[x], A10); 2387c8da7ce66017295a65ec028084b90800be377f8James Zern } 2397c8da7ce66017295a65ec028084b90800be377f8James Zern } 
2407c8da7ce66017295a65ec028084b90800be377f8James Zern width -= x; 2417c8da7ce66017295a65ec028084b90800be377f8James Zern if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse); 2427c8da7ce66017295a65ec028084b90800be377f8James Zern} 2437c8da7ce66017295a65ec028084b90800be377f8James Zern 244fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha, 245fa39824bb690c5806358871f46940d0450973d8aJames Zern int width, int inverse) { 2467c8da7ce66017295a65ec028084b90800be377f8James Zern int x = 0; 2477c8da7ce66017295a65ec028084b90800be377f8James Zern if (!inverse) { 2487c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 249fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i k128 = _mm_set1_epi16(128); 250fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i kMult = _mm_set1_epi16(0x0101); 251fa39824bb690c5806358871f46940d0450973d8aJames Zern for (x = 0; x + 8 <= width; x += 8) { 2527c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]); 253fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]); 2547c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i v1 = _mm_unpacklo_epi8(v0, zero); 255fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a1 = _mm_unpacklo_epi8(a0, zero); 256fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i v2 = _mm_mullo_epi16(v1, a1); 257fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i v3 = _mm_add_epi16(v2, k128); 258fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i v4 = _mm_mulhi_epu16(v3, kMult); 259fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i v5 = _mm_packus_epi16(v4, zero); 260fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storel_epi64((__m128i*)&ptr[x], v5); 2617c8da7ce66017295a65ec028084b90800be377f8James Zern } 
2627c8da7ce66017295a65ec028084b90800be377f8James Zern } 2637c8da7ce66017295a65ec028084b90800be377f8James Zern width -= x; 2647c8da7ce66017295a65ec028084b90800be377f8James Zern if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse); 2657c8da7ce66017295a65ec028084b90800be377f8James Zern} 2668c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 2678c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora//------------------------------------------------------------------------------ 2687c8da7ce66017295a65ec028084b90800be377f8James Zern// Entry point 2698c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 2708c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Aroraextern void WebPInitAlphaProcessingSSE2(void); 2718c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora 2727c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { 273fa39824bb690c5806358871f46940d0450973d8aJames Zern WebPMultARGBRow = MultARGBRow_SSE2; 274fa39824bb690c5806358871f46940d0450973d8aJames Zern WebPMultRow = MultRow_SSE2; 275fa39824bb690c5806358871f46940d0450973d8aJames Zern WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2; 2767c8da7ce66017295a65ec028084b90800be377f8James Zern WebPDispatchAlpha = DispatchAlpha; 2777c8da7ce66017295a65ec028084b90800be377f8James Zern WebPDispatchAlphaToGreen = DispatchAlphaToGreen; 2788c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora WebPExtractAlpha = ExtractAlpha; 2798c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora} 2807c8da7ce66017295a65ec028084b90800be377f8James Zern 2817c8da7ce66017295a65ec028084b90800be377f8James Zern#else // !WEBP_USE_SSE2 2827c8da7ce66017295a65ec028084b90800be377f8James Zern 2837c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) 2847c8da7ce66017295a65ec028084b90800be377f8James Zern 2857c8da7ce66017295a65ec028084b90800be377f8James Zern#endif // WEBP_USE_SSE2 286