1411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org/* 2411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org * 4411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org * Use of this source code is governed by a BSD-style license 5411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org * that can be found in the LICENSE file in the root of the source 6411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org * tree. An additional intellectual property rights grant can be found 7411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org * in the file PATENTS. All contributing project authors may 8411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org * be found in the AUTHORS file in the root of the source tree. 9411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org */ 10411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 11411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org#include <tmmintrin.h> 12411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org#include "vpx_ports/mem.h" 13411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org#include "vpx_ports/emmintrin_compat.h" 14411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 15411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org// filters only for the 4_h8 convolution 16411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 17411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 18411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 19411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 20411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 21411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 22411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 23411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 24411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org// filters for 8_h8 and 16_h8 25411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { 26411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 27411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 28411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 29411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { 30411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 31411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 32411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 33411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { 34411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 35411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 36411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 37411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { 38411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 39411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 40411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 41411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, 42411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int src_pixels_per_line, 43411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned char *output_ptr, 44411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_pitch, 45411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_height, 46411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int16_t *filter) { 4788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org __m128i firstFilters, secondFilters, shuffle1, shuffle2; 48411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 49411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i addFilterReg64, filtersReg, srcReg, minReg; 50411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int i; 51411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 52411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 53411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org addFilterReg64 =_mm_set1_epi32((int)0x0400040u); 54411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg = _mm_loadu_si128((__m128i *)filter); 55411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // converting the 16 bit (short) to 8 bit (byte) and have the same data 56411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // in both lanes of 128 bit register. 57411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 58411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 59411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the first 16 bits in the filter into the first lane 60411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org firstFilters = _mm_shufflelo_epi16(filtersReg, 0); 61411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the third 16 bit in the filter into the first lane 62411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); 63411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the seconds 16 bits in the filter into the second lane 6488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 65411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); 66411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the forth 16 bits in the filter into the second lane 6788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 68411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); 69411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 70411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // loading the local filters 7188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); 7288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); 73411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 74411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org for (i = 0; i < output_height; i++) { 75411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); 76411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 77411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // filter the source buffer 7888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); 7988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); 80411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 81411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 82411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 83411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 84411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 85411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // extract the higher half of the lane 86411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); 87411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); 88411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 89411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); 90411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 91411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate all the results together 92411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); 93411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); 94411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 95411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); 96411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 97411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 98411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shift by 7 bit each 16 bits 99411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 100411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 101411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shrink to 8 bit each 16 bits 102411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 103411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org src_ptr+=src_pixels_per_line; 104411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 105411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // save only 4 bytes 106411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); 107411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 108411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org output_ptr+=output_pitch; 109411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org } 110411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 111411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 112411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, 113411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int src_pixels_per_line, 114411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned char *output_ptr, 115411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_pitch, 116411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_height, 117411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int16_t *filter) { 118411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; 119411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; 120411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 121411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i addFilterReg64, filtersReg, minReg; 122411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int i; 123411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 124411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 125411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 126411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg = _mm_loadu_si128((__m128i *)filter); 127411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // converting the 16 bit (short) to 8 bit (byte) and have the same data 128411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // in both lanes of 128 bit register. 129411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 130411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 131411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the first 16 bits (first and second byte) 132411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 133411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 134411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the second 16 bits (third and forth byte) 135411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 136411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 137411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the third 16 bits (fifth and sixth byte) 138411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 139411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 140411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the forth 16 bits (seventh and eighth byte) 141411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 142411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 143411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 144411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt1Reg = _mm_load_si128((__m128i const *)filt1_global); 145411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt2Reg = _mm_load_si128((__m128i const *)filt2_global); 146411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt3Reg = _mm_load_si128((__m128i const *)filt3_global); 147411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt4Reg = _mm_load_si128((__m128i const *)filt4_global); 148411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 149411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org for (i = 0; i < output_height; i++) { 150411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); 151411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 152411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // filter the source buffer 153411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); 154411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); 155411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 156411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 157411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 158411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 159411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 160411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // filter the source buffer 161411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); 162411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); 163411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 164411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 165411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); 166411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); 167411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 168411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate all the results together 16988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); 17088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); 171411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 17288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); 173411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 17488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); 175411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 176411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 177411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shift by 7 bit each 16 bits 178411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 179411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 180411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shrink to 8 bit each 16 bits 181411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 182411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 183411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org src_ptr+=src_pixels_per_line; 184411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 185411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // save only 8 bytes 186411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); 187411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 188411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org output_ptr+=output_pitch; 189411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org } 190411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 191411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 192411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, 193411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int src_pixels_per_line, 194411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned char *output_ptr, 195411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_pitch, 196411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_height, 197411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int16_t *filter) { 198411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; 199411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; 200411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 201411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; 202411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int i; 203411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 204411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 205411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 206411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg = _mm_loadu_si128((__m128i *)filter); 207411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // converting the 16 bit (short) to 8 bit (byte) and have the same data 208411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // in both lanes of 128 bit register. 209411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 210411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 211411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the first 16 bits (first and second byte) 212411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 213411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 214411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the second 16 bits (third and forth byte) 215411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 216411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 217411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the third 16 bits (fifth and sixth byte) 218411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 219411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 220411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the forth 16 bits (seventh and eighth byte) 221411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // across 128 bit register 222411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 223411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 224411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt1Reg = _mm_load_si128((__m128i const *)filt1_global); 225411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt2Reg = _mm_load_si128((__m128i const *)filt2_global); 226411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt3Reg = _mm_load_si128((__m128i const *)filt3_global); 227411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filt4Reg = _mm_load_si128((__m128i const *)filt4_global); 228411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 229411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org for (i = 0; i < output_height; i++) { 230411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); 231411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 232411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // filter the source buffer 233411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); 23488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); 235411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 236411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 237411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); 23888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); 239411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 240411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 241411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 242411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 243411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // filter the source buffer 24488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); 245411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); 246411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 247411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 24888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); 249411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 250411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 251411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 252411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 253411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 254411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 255411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // reading the next 16 bytes. 256411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // (part of it was being read by earlier read) 257411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); 258411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 259411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 260411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 261411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 262411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 263411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // filter the source buffer 264411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); 26588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); 266411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 267411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 268411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); 26988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); 270411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 271411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 272411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); 273411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 274411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // filter the source buffer 27588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); 276411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); 277411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 278411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 27988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); 280411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 281411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 282411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 283411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 284411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 285411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 286411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 287411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 288411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); 289411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); 290411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 291411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shift by 7 bit each 16 bit 292411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); 293411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); 294411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 295411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shrink to 8 bit each 16 bits, the first lane contain the first 296411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // convolve result and the second lane contain the second convolve 297411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // result 298411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); 299411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 300411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org src_ptr+=src_pixels_per_line; 301411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 302411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // save 16 bytes 303411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); 304411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 305411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org output_ptr+=output_pitch; 306411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org } 307411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 308411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 309411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, 310411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int src_pitch, 311411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned char *output_ptr, 312411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int out_pitch, 313411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_height, 314411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int16_t *filter) { 315411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; 316411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 317411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; 318411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int i; 319411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 320411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 321411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 322411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg = _mm_loadu_si128((__m128i *)filter); 323411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // converting the 16 bit (short) to 8 bit (byte) and have the same data 324411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // in both lanes of 128 bit register. 325411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 326411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 327411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the first 16 bits in the filter 328411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 329411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the second 16 bits in the filter 330411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 331411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the third 16 bits in the filter 332411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 333411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the forth 16 bits in the filter 334411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 335411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 336411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org for (i = 0; i < output_height; i++) { 337411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // load the first 8 bytes 338411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); 339411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // load the next 8 bytes in stride of src_pitch 340411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); 341411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); 342411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); 343411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 344411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // merge the result together 345411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); 346411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); 347411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 348411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // load the next 8 bytes in stride of src_pitch 349411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); 350411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); 351411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); 352411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); 353411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 354411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // merge the result together 355411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); 356411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); 357411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 358411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 359411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 360411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); 361411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 362411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); 363411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 364411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 365411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); 366411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); 367411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); 368411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 369411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); 370411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 371411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 372411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shift by 7 bit each 16 bit 373411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 374411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 375411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shrink to 8 bit each 16 bits 376411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 377411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 378411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org src_ptr+=src_pitch; 379411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 380411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // save only 8 bytes convolve result 381411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); 382411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 383411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org output_ptr+=out_pitch; 384411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org } 385411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 386411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 387411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, 388411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int src_pitch, 389411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned char *output_ptr, 390411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int out_pitch, 391411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int output_height, 392411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int16_t *filter) { 393411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; 394411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 395411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; 396411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int i; 397411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 398411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 399411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 400411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg = _mm_loadu_si128((__m128i *)filter); 401411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // converting the 16 bit (short) to 8 bit (byte) and have the same data 402411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // in both lanes of 128 bit register. 403411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 404411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 405411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the first 16 bits in the filter 406411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 407411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the second 16 bits in the filter 408411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 409411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the third 16 bits in the filter 410411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 411411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // duplicate only the forth 16 bits in the filter 412411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 413411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 414411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org for (i = 0; i < output_height; i++) { 415411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // load the first 16 bytes 416411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); 417411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // load the next 16 bytes in stride of src_pitch 418411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); 419411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); 420411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); 421411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 422411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // merge the result together 423411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); 424411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); 425411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); 426411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); 427411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 428411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 429411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); 430411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); 431411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 432411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); 433411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 434411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 435411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); 436411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); 437411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 438411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // load the next 16 bytes in stride of two/three src_pitch 439411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); 440411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); 441411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 442411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // merge the result together 443411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); 444411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); 445411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 446411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 447411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); 448411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); 449411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 450411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // load the next 16 bytes in stride of four/five src_pitch 451411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); 452411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); 453411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 454411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // merge the result together 455411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); 456411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); 457411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 458411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // multiply 2 adjacent elements with the filter and add the result 459411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); 460411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); 461411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 462411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 463411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, 464411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_min_epi16(srcRegFilt4, srcRegFilt7)); 465411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, 466411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_min_epi16(srcRegFilt6, srcRegFilt8)); 467411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 468411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // add and saturate the results together 469411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, 470411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_max_epi16(srcRegFilt4, srcRegFilt7)); 471411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, 472411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_max_epi16(srcRegFilt6, srcRegFilt8)); 473411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); 474411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 475411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 476411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shift by 7 bit each 16 bit 477411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); 478411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 479411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 480411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // shrink to 8 bit each 16 bits, the first lane contain the first 481411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // convolve result and the second lane contain the second convolve 482411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // result 483411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); 484411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 485411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org src_ptr+=src_pitch; 486411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 487411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // save 16 bytes convolve result 488411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 489411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 490411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org output_ptr+=out_pitch; 491411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org } 492411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 493