17c8da7ce66017295a65ec028084b90800be377f8James Zern// Copyright 2015 Google Inc. All Rights Reserved. 27c8da7ce66017295a65ec028084b90800be377f8James Zern// 37c8da7ce66017295a65ec028084b90800be377f8James Zern// Use of this source code is governed by a BSD-style license 47c8da7ce66017295a65ec028084b90800be377f8James Zern// that can be found in the COPYING file in the root of the source 57c8da7ce66017295a65ec028084b90800be377f8James Zern// tree. An additional intellectual property rights grant can be found 67c8da7ce66017295a65ec028084b90800be377f8James Zern// in the file PATENTS. All contributing project authors may 77c8da7ce66017295a65ec028084b90800be377f8James Zern// be found in the AUTHORS file in the root of the source tree. 87c8da7ce66017295a65ec028084b90800be377f8James Zern// ----------------------------------------------------------------------------- 97c8da7ce66017295a65ec028084b90800be377f8James Zern// 107c8da7ce66017295a65ec028084b90800be377f8James Zern// SSE2 variant of methods for lossless encoder 117c8da7ce66017295a65ec028084b90800be377f8James Zern// 127c8da7ce66017295a65ec028084b90800be377f8James Zern// Author: Skal (pascal.massimino@gmail.com) 137c8da7ce66017295a65ec028084b90800be377f8James Zern 147c8da7ce66017295a65ec028084b90800be377f8James Zern#include "./dsp.h" 157c8da7ce66017295a65ec028084b90800be377f8James Zern 167c8da7ce66017295a65ec028084b90800be377f8James Zern#if defined(WEBP_USE_SSE2) 177c8da7ce66017295a65ec028084b90800be377f8James Zern#include <assert.h> 187c8da7ce66017295a65ec028084b90800be377f8James Zern#include <emmintrin.h> 197c8da7ce66017295a65ec028084b90800be377f8James Zern#include "./lossless.h" 207c8da7ce66017295a65ec028084b90800be377f8James Zern 217c8da7ce66017295a65ec028084b90800be377f8James Zern// For sign-extended multiplying constants, pre-shifted by 5: 227c8da7ce66017295a65ec028084b90800be377f8James Zern#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5) 237c8da7ce66017295a65ec028084b90800be377f8James Zern 
247c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 257c8da7ce66017295a65ec028084b90800be377f8James Zern// Subtract-Green Transform 267c8da7ce66017295a65ec028084b90800be377f8James Zern 277c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { 287c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 297c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i + 4 <= num_pixels; i += 4) { 307c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb 317c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g 327c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); 337c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g 347c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out = _mm_sub_epi8(in, C); 357c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&argb_data[i], out); 367c8da7ce66017295a65ec028084b90800be377f8James Zern } 377c8da7ce66017295a65ec028084b90800be377f8James Zern // fallthrough and finish off with plain-C 387c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); 397c8da7ce66017295a65ec028084b90800be377f8James Zern} 407c8da7ce66017295a65ec028084b90800be377f8James Zern 417c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 427c8da7ce66017295a65ec028084b90800be377f8James Zern// Color Transform 437c8da7ce66017295a65ec028084b90800be377f8James Zern 447c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void TransformColor(const VP8LMultipliers* const m, 
457c8da7ce66017295a65ec028084b90800be377f8James Zern uint32_t* argb_data, int num_pixels) { 467c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mults_rb = _mm_set_epi16( 477c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), 487c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), 497c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), 507c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_)); 517c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mults_b2 = _mm_set_epi16( 527c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0, 537c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0); 547c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks 557c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks 567c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 577c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i + 4 <= num_pixels; i += 4) { 587c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb 597c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 607c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); 617c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 627c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 637c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i E = 
_mm_slli_epi16(in, 8); // r 0 b 0 647c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0 657c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2 667c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i H = _mm_add_epi8(G, D); // x dr x db 677c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db 687c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out = _mm_sub_epi8(in, I); 697c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&argb_data[i], out); 707c8da7ce66017295a65ec028084b90800be377f8James Zern } 717c8da7ce66017295a65ec028084b90800be377f8James Zern // fallthrough and finish off with plain-C 727c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LTransformColor_C(m, argb_data + i, num_pixels - i); 737c8da7ce66017295a65ec028084b90800be377f8James Zern} 747c8da7ce66017295a65ec028084b90800be377f8James Zern 757c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 767c8da7ce66017295a65ec028084b90800be377f8James Zern#define SPAN 8 777c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void CollectColorBlueTransforms(const uint32_t* argb, int stride, 787c8da7ce66017295a65ec028084b90800be377f8James Zern int tile_width, int tile_height, 797c8da7ce66017295a65ec028084b90800be377f8James Zern int green_to_blue, int red_to_blue, 807c8da7ce66017295a65ec028084b90800be377f8James Zern int histo[]) { 817c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mults_r = _mm_set_epi16( 827c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0, 837c8da7ce66017295a65ec028084b90800be377f8James Zern CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0); 847c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mults_g = _mm_set_epi16( 
857c8da7ce66017295a65ec028084b90800be377f8James Zern 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue), 867c8da7ce66017295a65ec028084b90800be377f8James Zern 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue)); 877c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask 887c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask 897c8da7ce66017295a65ec028084b90800be377f8James Zern int y; 907c8da7ce66017295a65ec028084b90800be377f8James Zern for (y = 0; y < tile_height; ++y) { 917c8da7ce66017295a65ec028084b90800be377f8James Zern const uint32_t* const src = argb + y * stride; 927c8da7ce66017295a65ec028084b90800be377f8James Zern int i, x; 937c8da7ce66017295a65ec028084b90800be377f8James Zern for (x = 0; x + SPAN <= tile_width; x += SPAN) { 947c8da7ce66017295a65ec028084b90800be377f8James Zern uint16_t values[SPAN]; 957c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); 967c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); 977c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0 987c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A1 = _mm_slli_epi16(in1, 8); 997c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 1007c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B1 = _mm_and_si128(in1, mask_g); 1017c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0 1027c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C1 = _mm_mulhi_epi16(A1, mults_r); 1037c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db 1047c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i D1 = 
_mm_mulhi_epi16(B1, mults_g); 1057c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b' 1067c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i E1 = _mm_sub_epi8(in1, D1); 1077c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db 1087c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i F1 = _mm_srli_epi32(C1, 16); 1097c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b' 1107c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i G1 = _mm_sub_epi8(E1, F1); 1117c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b 1127c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i H1 = _mm_and_si128(G1, mask_b); 1137c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b' 1147c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)values, I); 1157c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < SPAN; ++i) ++histo[values[i]]; 1167c8da7ce66017295a65ec028084b90800be377f8James Zern } 1177c8da7ce66017295a65ec028084b90800be377f8James Zern } 1187c8da7ce66017295a65ec028084b90800be377f8James Zern { 1197c8da7ce66017295a65ec028084b90800be377f8James Zern const int left_over = tile_width & (SPAN - 1); 1207c8da7ce66017295a65ec028084b90800be377f8James Zern if (left_over > 0) { 1217c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, 1227c8da7ce66017295a65ec028084b90800be377f8James Zern left_over, tile_height, 1237c8da7ce66017295a65ec028084b90800be377f8James Zern green_to_blue, red_to_blue, histo); 1247c8da7ce66017295a65ec028084b90800be377f8James Zern } 1257c8da7ce66017295a65ec028084b90800be377f8James Zern } 1267c8da7ce66017295a65ec028084b90800be377f8James Zern} 
1277c8da7ce66017295a65ec028084b90800be377f8James Zern 1287c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void CollectColorRedTransforms(const uint32_t* argb, int stride, 1297c8da7ce66017295a65ec028084b90800be377f8James Zern int tile_width, int tile_height, 1307c8da7ce66017295a65ec028084b90800be377f8James Zern int green_to_red, int histo[]) { 1317c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mults_g = _mm_set_epi16( 1327c8da7ce66017295a65ec028084b90800be377f8James Zern 0, CST_5b(green_to_red), 0, CST_5b(green_to_red), 1337c8da7ce66017295a65ec028084b90800be377f8James Zern 0, CST_5b(green_to_red), 0, CST_5b(green_to_red)); 1347c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask 1357c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i mask = _mm_set1_epi32(0xff); 1367c8da7ce66017295a65ec028084b90800be377f8James Zern 1377c8da7ce66017295a65ec028084b90800be377f8James Zern int y; 1387c8da7ce66017295a65ec028084b90800be377f8James Zern for (y = 0; y < tile_height; ++y) { 1397c8da7ce66017295a65ec028084b90800be377f8James Zern const uint32_t* const src = argb + y * stride; 1407c8da7ce66017295a65ec028084b90800be377f8James Zern int i, x; 1417c8da7ce66017295a65ec028084b90800be377f8James Zern for (x = 0; x + SPAN <= tile_width; x += SPAN) { 1427c8da7ce66017295a65ec028084b90800be377f8James Zern uint16_t values[SPAN]; 1437c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); 1447c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); 1457c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 1467c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A1 = _mm_and_si128(in1, mask_g); 1477c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r 
1487c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B1 = _mm_srli_epi32(in1, 16); 1497c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr 1507c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C1 = _mm_mulhi_epi16(A1, mults_g); 1517c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r' 1527c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i E1 = _mm_sub_epi8(B1, C1); 1537c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r' 1547c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i F1 = _mm_and_si128(E1, mask); 1557c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i I = _mm_packs_epi32(F0, F1); 1567c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)values, I); 1577c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < SPAN; ++i) ++histo[values[i]]; 1587c8da7ce66017295a65ec028084b90800be377f8James Zern } 1597c8da7ce66017295a65ec028084b90800be377f8James Zern } 1607c8da7ce66017295a65ec028084b90800be377f8James Zern { 1617c8da7ce66017295a65ec028084b90800be377f8James Zern const int left_over = tile_width & (SPAN - 1); 1627c8da7ce66017295a65ec028084b90800be377f8James Zern if (left_over > 0) { 1637c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, 1647c8da7ce66017295a65ec028084b90800be377f8James Zern left_over, tile_height, 1657c8da7ce66017295a65ec028084b90800be377f8James Zern green_to_red, histo); 1667c8da7ce66017295a65ec028084b90800be377f8James Zern } 1677c8da7ce66017295a65ec028084b90800be377f8James Zern } 1687c8da7ce66017295a65ec028084b90800be377f8James Zern} 1697c8da7ce66017295a65ec028084b90800be377f8James Zern#undef SPAN 1707c8da7ce66017295a65ec028084b90800be377f8James Zern 1717c8da7ce66017295a65ec028084b90800be377f8James 
//------------------------------------------------------------------------------

#define LINE_SIZE 16    // 8 or 16
// Computes out[i] = a[i] + b[i] for i in [0, size), LINE_SIZE elements per
// iteration. 'size' must be a multiple of LINE_SIZE (asserted). All loads of
// an iteration are issued before its stores, and unaligned accesses are used,
// so no particular alignment is required.
static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
                      int size) {
  int i;
  assert(size % LINE_SIZE == 0);
  for (i = 0; i < size; i += LINE_SIZE) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
#if (LINE_SIZE == 16)
    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i +  0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i +  4]);
#if (LINE_SIZE == 16)
    const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i +  8]);
    const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
#endif
    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
  }
}

// In-place variant: computes out[i] += a[i] for i in [0, size).
// 'size' must be a multiple of LINE_SIZE (asserted).
static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
  int i;
  assert(size % LINE_SIZE == 0);
  for (i = 0; i < size; i += LINE_SIZE) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
#if (LINE_SIZE == 16)
    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
#if (LINE_SIZE == 16)
    const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
    const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
#endif
    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
  }
}
#undef LINE_SIZE

// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
2287c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void HistogramAdd(const VP8LHistogram* const a, 2297c8da7ce66017295a65ec028084b90800be377f8James Zern const VP8LHistogram* const b, 2307c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LHistogram* const out) { 2317c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 2327c8da7ce66017295a65ec028084b90800be377f8James Zern const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); 2337c8da7ce66017295a65ec028084b90800be377f8James Zern assert(a->palette_code_bits_ == b->palette_code_bits_); 2347c8da7ce66017295a65ec028084b90800be377f8James Zern if (b != out) { 2357c8da7ce66017295a65ec028084b90800be377f8James Zern AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES); 2367c8da7ce66017295a65ec028084b90800be377f8James Zern AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES); 2377c8da7ce66017295a65ec028084b90800be377f8James Zern AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES); 2387c8da7ce66017295a65ec028084b90800be377f8James Zern AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES); 2397c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 2407c8da7ce66017295a65ec028084b90800be377f8James Zern AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES); 2417c8da7ce66017295a65ec028084b90800be377f8James Zern AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES); 2427c8da7ce66017295a65ec028084b90800be377f8James Zern AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES); 2437c8da7ce66017295a65ec028084b90800be377f8James Zern AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES); 2447c8da7ce66017295a65ec028084b90800be377f8James Zern } 2457c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = NUM_LITERAL_CODES; i < literal_size; ++i) { 2467c8da7ce66017295a65ec028084b90800be377f8James Zern out->literal_[i] = a->literal_[i] + b->literal_[i]; 2477c8da7ce66017295a65ec028084b90800be377f8James Zern } 2487c8da7ce66017295a65ec028084b90800be377f8James Zern for (i 
= 0; i < NUM_DISTANCE_CODES; ++i) { 2497c8da7ce66017295a65ec028084b90800be377f8James Zern out->distance_[i] = a->distance_[i] + b->distance_[i]; 2507c8da7ce66017295a65ec028084b90800be377f8James Zern } 2517c8da7ce66017295a65ec028084b90800be377f8James Zern} 2527c8da7ce66017295a65ec028084b90800be377f8James Zern 2537c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 2547c8da7ce66017295a65ec028084b90800be377f8James Zern// Entropy 2557c8da7ce66017295a65ec028084b90800be377f8James Zern 2567c8da7ce66017295a65ec028084b90800be377f8James Zern// Checks whether the X or Y contribution is worth computing and adding. 2577c8da7ce66017295a65ec028084b90800be377f8James Zern// Used in loop unrolling. 2587c8da7ce66017295a65ec028084b90800be377f8James Zern#define ANALYZE_X_OR_Y(x_or_y, j) \ 2597c8da7ce66017295a65ec028084b90800be377f8James Zern do { \ 2607c8da7ce66017295a65ec028084b90800be377f8James Zern if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \ 2617c8da7ce66017295a65ec028084b90800be377f8James Zern } while (0) 2627c8da7ce66017295a65ec028084b90800be377f8James Zern 2637c8da7ce66017295a65ec028084b90800be377f8James Zern// Checks whether the X + Y contribution is worth computing and adding. 2647c8da7ce66017295a65ec028084b90800be377f8James Zern// Used in loop unrolling. 
2657c8da7ce66017295a65ec028084b90800be377f8James Zern#define ANALYZE_XY(j) \ 2667c8da7ce66017295a65ec028084b90800be377f8James Zern do { \ 2677c8da7ce66017295a65ec028084b90800be377f8James Zern if (tmp[j] != 0) { \ 2687c8da7ce66017295a65ec028084b90800be377f8James Zern retval -= VP8LFastSLog2(tmp[j]); \ 2697c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_X_OR_Y(X, j); \ 2707c8da7ce66017295a65ec028084b90800be377f8James Zern } \ 2717c8da7ce66017295a65ec028084b90800be377f8James Zern } while (0) 2727c8da7ce66017295a65ec028084b90800be377f8James Zern 2737c8da7ce66017295a65ec028084b90800be377f8James Zernstatic float CombinedShannonEntropy(const int X[256], const int Y[256]) { 2747c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 2757c8da7ce66017295a65ec028084b90800be377f8James Zern double retval = 0.; 2767c8da7ce66017295a65ec028084b90800be377f8James Zern int sumX, sumXY; 2777c8da7ce66017295a65ec028084b90800be377f8James Zern int32_t tmp[4]; 2787c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i zero = _mm_setzero_si128(); 2797c8da7ce66017295a65ec028084b90800be377f8James Zern // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY). 2807c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i sumXY_128 = zero; 2817c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i sumX_128 = zero; 2827c8da7ce66017295a65ec028084b90800be377f8James Zern 2837c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < 256; i += 4) { 2847c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i x = _mm_loadu_si128((const __m128i*)(X + i)); 2857c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i)); 2867c8da7ce66017295a65ec028084b90800be377f8James Zern 2877c8da7ce66017295a65ec028084b90800be377f8James Zern // Check if any X is non-zero: this actually provides a speedup as X is 2887c8da7ce66017295a65ec028084b90800be377f8James Zern // usually sparse. 
2897c8da7ce66017295a65ec028084b90800be377f8James Zern if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) { 2907c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i xy_128 = _mm_add_epi32(x, y); 2917c8da7ce66017295a65ec028084b90800be377f8James Zern sumXY_128 = _mm_add_epi32(sumXY_128, xy_128); 2927c8da7ce66017295a65ec028084b90800be377f8James Zern 2937c8da7ce66017295a65ec028084b90800be377f8James Zern sumX_128 = _mm_add_epi32(sumX_128, x); 2947c8da7ce66017295a65ec028084b90800be377f8James Zern 2957c8da7ce66017295a65ec028084b90800be377f8James Zern // Analyze the different X + Y. 2967c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)tmp, xy_128); 2977c8da7ce66017295a65ec028084b90800be377f8James Zern 2987c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_XY(0); 2997c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_XY(1); 3007c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_XY(2); 3017c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_XY(3); 3027c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 3037c8da7ce66017295a65ec028084b90800be377f8James Zern // X is fully 0, so only deal with Y. 3047c8da7ce66017295a65ec028084b90800be377f8James Zern sumXY_128 = _mm_add_epi32(sumXY_128, y); 3057c8da7ce66017295a65ec028084b90800be377f8James Zern 3067c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_X_OR_Y(Y, 0); 3077c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_X_OR_Y(Y, 1); 3087c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_X_OR_Y(Y, 2); 3097c8da7ce66017295a65ec028084b90800be377f8James Zern ANALYZE_X_OR_Y(Y, 3); 3107c8da7ce66017295a65ec028084b90800be377f8James Zern } 3117c8da7ce66017295a65ec028084b90800be377f8James Zern } 3127c8da7ce66017295a65ec028084b90800be377f8James Zern 3137c8da7ce66017295a65ec028084b90800be377f8James Zern // Sum up sumX_128 to get sumX. 
3147c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)tmp, sumX_128); 3157c8da7ce66017295a65ec028084b90800be377f8James Zern sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0]; 3167c8da7ce66017295a65ec028084b90800be377f8James Zern 3177c8da7ce66017295a65ec028084b90800be377f8James Zern // Sum up sumXY_128 to get sumXY. 3187c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)tmp, sumXY_128); 3197c8da7ce66017295a65ec028084b90800be377f8James Zern sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0]; 3207c8da7ce66017295a65ec028084b90800be377f8James Zern 3217c8da7ce66017295a65ec028084b90800be377f8James Zern retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); 3227c8da7ce66017295a65ec028084b90800be377f8James Zern return (float)retval; 3237c8da7ce66017295a65ec028084b90800be377f8James Zern} 3247c8da7ce66017295a65ec028084b90800be377f8James Zern#undef ANALYZE_X_OR_Y 3257c8da7ce66017295a65ec028084b90800be377f8James Zern#undef ANALYZE_XY 3267c8da7ce66017295a65ec028084b90800be377f8James Zern 3277c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 3287c8da7ce66017295a65ec028084b90800be377f8James Zern// Entry point 3297c8da7ce66017295a65ec028084b90800be377f8James Zern 3307c8da7ce66017295a65ec028084b90800be377f8James Zernextern void VP8LEncDspInitSSE2(void); 3317c8da7ce66017295a65ec028084b90800be377f8James Zern 3327c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { 3337c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; 3347c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LTransformColor = TransformColor; 3357c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LCollectColorBlueTransforms = CollectColorBlueTransforms; 3367c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LCollectColorRedTransforms = CollectColorRedTransforms; 
3377c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LHistogramAdd = HistogramAdd; 3387c8da7ce66017295a65ec028084b90800be377f8James Zern VP8LCombinedShannonEntropy = CombinedShannonEntropy; 3397c8da7ce66017295a65ec028084b90800be377f8James Zern} 3407c8da7ce66017295a65ec028084b90800be377f8James Zern 3417c8da7ce66017295a65ec028084b90800be377f8James Zern#else // !WEBP_USE_SSE2 3427c8da7ce66017295a65ec028084b90800be377f8James Zern 3437c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2) 3447c8da7ce66017295a65ec028084b90800be377f8James Zern 3457c8da7ce66017295a65ec028084b90800be377f8James Zern#endif // WEBP_USE_SSE2 346