// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author: Skal (pascal.massimino@gmail.com)

#include "./dsp.h"

#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>

//------------------------------------------------------------------------------

// Writes the 'alpha' plane into byte 0 of every 4-byte pixel of 'dst', leaving
// the three other bytes of each pixel untouched.
//   alpha, alpha_stride: source plane and its row pitch, in bytes.
//   width, height:       dimensions, in pixels.
//   dst, dst_stride:     destination pixels and their row pitch, in bytes.
// Returns non-zero if at least one alpha value differs from 0xff.
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  // Per-pixel mask clearing byte 0 (the alpha slot) and preserving the rest.
  const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00u);
  // Only the low 8 bytes are set: they match the 8 alpha bytes loaded below.
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
  __m128i all_alphas = all_0xff;

  // The SIMD loop reads and writes whole 4-byte pixels. Capping it at
  // 'width - 1' guarantees that the last pixel always goes through the scalar
  // loop, which touches 'dst[4 * width - 4]' only and never the 3 bytes that
  // follow it.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      // widen them to 8 x 32 bits, alpha sitting in the lowest byte
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    // Scalar tail: at least the last pixel of the row ends up here.
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask. The upper 8 bits of the
  // movemask are discarded by the 'and' with alpha_and (which is <= 0xff).
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}
// Expands each alpha byte into a 32-bit pixel equal to 'alpha << 8' (i.e. the
// value lands in bits 8..15 and every other bit is zero).
//   alpha, alpha_stride: source plane and its row pitch, in bytes.
//   width, height:       dimensions, in pixels.
//   dst, dst_stride:     destination pixels and their row pitch, counted in
//                        uint32_t elements.
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const int limit = width & ~15;
  for (j = 0; j < height; ++j) {
    for (i = 0; i < limit; i += 16) {   // process 16 alpha bytes
      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
      // 'zero' goes first so that alpha ends up in the high byte of each
      // 16-bit lane, which is the '<< 8'.
      const __m128i a1 = _mm_unpacklo_epi8(zero, a0);
      const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
      // Widen the 16-bit lanes to 32 bits with zeroed upper halves.
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
      _mm_storeu_si128((__m128i*)&dst[i +  0], a2_lo);
      _mm_storeu_si128((__m128i*)&dst[i +  4], a2_hi);
      _mm_storeu_si128((__m128i*)&dst[i +  8], b2_lo);
      _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
    }
    // Scalar tail for the last 'width % 16' pixels.
    for (; i < width; ++i) {
      dst[i] = alpha[i] << 8;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
// Copies byte 0 of every 4-byte pixel of 'argb' into the 'alpha' plane.
//   argb, argb_stride:   source pixels and their row pitch, in bytes.
//   width, height:       dimensions, in pixels.
//   alpha, alpha_stride: destination plane and its row pitch, in bytes.
// Returns non-zero if all the extracted values are equal to 0xff.
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xff);  // to preserve alpha
  // Only the low 8 bytes are set; the accumulator's upper half stays zero.
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
  __m128i all_alphas = all_0xff;

  // The SIMD loop loads whole 4-byte pixels. Capping it at 'width - 1'
  // guarantees that the last pixel always goes through the scalar loop, which
  // reads 'argb[4 * width - 4]' only and never the 3 bytes that follow it.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      // keep byte 0 of each pixel
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      // narrow 8 x 32 bits -> 8 x 16 bits -> 8 bytes (values are <= 0xff, so
      // the saturating packs are lossless)
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    // Scalar tail: at least the last pixel of the row ends up here.
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask. The upper 8 bits of the
  // movemask are discarded by the 'and' with alpha_and (which is <= 0xff).
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
}

//------------------------------------------------------------------------------
// Non-dither premultiplied modes

// Scalar helpers for the left-over pixels: x * a / 255 is evaluated as
// (x * a * 0x8081) >> 23.
#define MULTIPLIER(a)   ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
// value.
// APPLY_ALPHA processes the 4 pixels (16 bytes) starting at RGBX, in place.
// It expects 'zero', 'kMask' and 'kMult' to be defined by the caller.
#define APPLY_ALPHA(RGBX, SHUFFLE) do {                              \
  const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX));    \
  const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero);           \
  const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero);           \
  const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask);           \
  const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask);           \
  const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
  const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
  const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
  const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
  /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */                          \
  const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo);        \
  const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi);        \
  const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult);               \
  const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult);               \
  const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7);                    \
  const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7);                    \
  const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi);                 \
  _mm_storeu_si128((__m128i*)&(RGBX), A3);                           \
} while (0)

// Multiplies, in place, the three colour bytes of every 4-byte pixel by the
// pixel's alpha byte (scaled by 1/255).
//   rgba:        first pixel of the first row.
//   alpha_first: if non-zero, alpha is byte 0 of a pixel and the colours are
//                bytes 1..3; otherwise alpha is byte 3 and the colours are
//                bytes 0..2.
//   w, h:        dimensions, in pixels.
//   stride:      row pitch, in bytes.
// The alpha byte itself is left unchanged.
static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
                                    int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i kMult = _mm_set1_epi16((short)0x8081);
  // 0xff is or'ed into 16-bit lanes 1 and 2 of each pixel: the shuffles below
  // pick one of them as the multiplier of the alpha lane.
  const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
  const int kSpan = 4;
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
      }
    } else {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
      }
    }
    // Finish with left-overs.
    for (; i < w; ++i) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * i];
      if (a != 0xff) {   // multiplying by 0xff would be a no-op
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
      }
    }
    rgba += stride;
  }
}
#undef APPLY_ALPHA
#undef MULTIPLIER
#undef PREMULTIPLY

// -----------------------------------------------------------------------------
// Apply alpha value to rows

// Processes 'width' 32-bit pixels of 'ptr' in place.
// When 'inverse' is zero, pixels are handled two at a time: the three low
// bytes of each pixel are multiplied by its top byte (alpha) and divided by
// 255 with rounding, while the top byte is left unchanged. A left-over pixel,
// as well as the whole row when 'inverse' is non-zero, is handed over to
// WebPMultARGBRowC().
static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    // 0xff is or'ed into 16-bit lane 2 of each pixel: the shuffles below pick
    // it as the multiplier of the alpha lane.
    const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
    for (x = 0; x + kSpan <= width; x += kSpan) {
      // To compute 'result = (int)(a * v / 255. + .5)', we use:
      //   tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
      const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
      const __m128i A2 = _mm_or_si128(A1, kMask);
      const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
      const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
      // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
      const __m128i A5 = _mm_mullo_epi16(A4, A1);
      const __m128i A6 = _mm_add_epi16(A5, k128);
      const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
      const __m128i A10 = _mm_packus_epi16(A7, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], A10);
    }
  }
  // Whatever the SIMD loop did not cover goes through the C version.
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
// Processes 'width' bytes of 'ptr' in place, using the matching 'alpha' bytes.
// When 'inverse' is zero, bytes are handled eight at a time: each one is
// multiplied by its alpha value and divided by 255 with rounding. Left-over
// bytes, as well as the whole row when 'inverse' is non-zero, are handed over
// to WebPMultRowC().
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
                         int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    for (x = 0; x + 8 <= width; x += 8) {
      // result = ((v * a + 128) * 0x0101) >> 16
      const __m128i v0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i v2 = _mm_mullo_epi16(v1, a1);
      const __m128i v3 = _mm_add_epi16(v2, k128);
      const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
      const __m128i v5 = _mm_packus_epi16(v4, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v5);
    }
  }
  // Whatever the SIMD loop did not cover goes through the C version.
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}

//------------------------------------------------------------------------------
// Entry point

2708c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Aroraextern void WebPInitAlphaProcessingSSE2(void);
2718c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora
2727c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
273fa39824bb690c5806358871f46940d0450973d8aJames Zern  WebPMultARGBRow = MultARGBRow_SSE2;
274fa39824bb690c5806358871f46940d0450973d8aJames Zern  WebPMultRow = MultRow_SSE2;
275fa39824bb690c5806358871f46940d0450973d8aJames Zern  WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
2767c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPDispatchAlpha = DispatchAlpha;
2777c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
2788c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  WebPExtractAlpha = ExtractAlpha;
2798c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora}
2817c8da7ce66017295a65ec028084b90800be377f8James Zern#else  // !WEBP_USE_SSE2
2827c8da7ce66017295a65ec028084b90800be377f8James Zern
2837c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)
2847c8da7ce66017295a65ec028084b90800be377f8James Zern
2857c8da7ce66017295a65ec028084b90800be377f8James Zern#endif  // WEBP_USE_SSE2