1b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* 2b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * 4b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian */ 10b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 11b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include <tmmintrin.h> 12b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include "vpx_ports/mem.h" 13b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include "vpx_ports/emmintrin_compat.h" 14b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 15b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian// filters only for the 4_h8 convolution 16b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh VenkatasubramanianDECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 17b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 18b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}; 19b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 20b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh VenkatasubramanianDECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 21b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 22b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}; 23b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 24b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian// filters for 8_h8 and 16_h8 25b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh VenkatasubramanianDECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { 26b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 27b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}; 28b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 29b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh VenkatasubramanianDECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { 30b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 31b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}; 32b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 33b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh VenkatasubramanianDECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { 34b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 35b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}; 36b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 37b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh VenkatasubramanianDECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { 38b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 39b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian}; 40b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 41b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, 42b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int src_pixels_per_line, 43b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned char *output_ptr, 44b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_pitch, 45b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_height, 46b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int16_t *filter) { 47b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 48b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 49b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i addFilterReg64, filtersReg, srcReg, minReg; 50b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int i; 51b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 52b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 53b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian addFilterReg64 =_mm_set1_epi32((int)0x0400040u); 54b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg = _mm_loadu_si128((__m128i *)filter); 55b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // converting the 16 bit (short) to 8 bit (byte) and have the same data 56b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // in both lanes of 128 bit register. 57b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 58b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 59b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the first 16 bits in the filter into the first lane 60b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian firstFilters = _mm_shufflelo_epi16(filtersReg, 0); 61b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the third 16 bit in the filter into the first lane 62b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); 63b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the seconds 16 bits in the filter into the second lane 64b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); 65b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the forth 16 bits in the filter into the second lane 66b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); 67b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 68b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // loading the local filters 69b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); 70b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); 71b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 72b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < output_height; i++) { 73b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); 74b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 75b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // filter the source buffer 76b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); 77b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); 78b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 79b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 80b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 81b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 82b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 83b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // extract the higher half of the lane 84b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); 85b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); 86b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 87b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); 88b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 89b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate all the results together 90b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); 91b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); 92b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 93b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); 94b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 95b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 96b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift by 7 bit each 16 bits 97b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 98b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 99b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shrink to 8 bit each 16 bits 100b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 101b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian src_ptr+=src_pixels_per_line; 102b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 103b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // save only 4 bytes 104b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); 105b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 106b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian output_ptr+=output_pitch; 107b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 108b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 109b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 110b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, 111b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int src_pixels_per_line, 112b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned char *output_ptr, 113b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_pitch, 114b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_height, 115b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int16_t *filter) { 116b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; 117b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; 118b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 119b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i addFilterReg64, filtersReg, minReg; 120b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int i; 121b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 122b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 123b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 124b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg = _mm_loadu_si128((__m128i *)filter); 125b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // converting the 16 bit (short) to 8 bit (byte) and have the same data 126b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // in both lanes of 128 bit register. 127b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 128b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 129b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the first 16 bits (first and second byte) 130b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 131b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 132b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the second 16 bits (third and forth byte) 133b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 134b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 135b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the third 16 bits (fifth and sixth byte) 136b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 137b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 138b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the forth 16 bits (seventh and eighth byte) 139b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 140b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 141b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 142b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt1Reg = _mm_load_si128((__m128i const *)filt1_global); 143b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt2Reg = _mm_load_si128((__m128i const *)filt2_global); 144b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt3Reg = _mm_load_si128((__m128i const *)filt3_global); 145b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt4Reg = _mm_load_si128((__m128i const *)filt4_global); 146b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 147b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < output_height; i++) { 148b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); 149b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 150b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // filter the source buffer 151b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); 152b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); 153b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 154b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 155b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 156b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 157b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 158b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // filter the source buffer 159b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); 160b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); 161b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 162b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 163b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); 164b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); 165b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 166b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate all the results together 167b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); 168b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); 169b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 170b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); 171b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 172b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); 173b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 174b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 175b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift by 7 bit each 16 bits 176b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 177b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 178b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shrink to 8 bit each 16 bits 179b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 180b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 181b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian src_ptr+=src_pixels_per_line; 182b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 183b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // save only 8 bytes 184b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); 185b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 186b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian output_ptr+=output_pitch; 187b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 188b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 189b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 190b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, 191b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int src_pixels_per_line, 192b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned char *output_ptr, 193b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_pitch, 194b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_height, 195b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int16_t *filter) { 196b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; 197b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; 198b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 199b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; 200b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int i; 201b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 202b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 203b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 204b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg = _mm_loadu_si128((__m128i *)filter); 205b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // converting the 16 bit (short) to 8 bit (byte) and have the same data 206b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // in both lanes of 128 bit register. 207b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 208b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 209b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the first 16 bits (first and second byte) 210b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 211b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 212b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the second 16 bits (third and forth byte) 213b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 214b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 215b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the third 16 bits (fifth and sixth byte) 216b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 217b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 218b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the forth 16 bits (seventh and eighth byte) 219b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // across 128 bit register 220b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 221b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 222b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt1Reg = _mm_load_si128((__m128i const *)filt1_global); 223b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt2Reg = _mm_load_si128((__m128i const *)filt2_global); 224b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt3Reg = _mm_load_si128((__m128i const *)filt3_global); 225b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filt4Reg = _mm_load_si128((__m128i const *)filt4_global); 226b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 227b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < output_height; i++) { 228b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); 229b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 230b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // filter the source buffer 231b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); 232b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); 233b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 234b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 235b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); 236b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 237b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 238b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 239b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 240b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 241b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // filter the source buffer 242b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); 243b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); 244b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 245b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 246b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); 247b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 248b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 249b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 250b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 251b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 252b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 253b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // reading the next 16 bytes. 254b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // (part of it was being read by earlier read) 255b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); 256b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 257b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 258b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 259b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 260b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 261b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // filter the source buffer 262b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); 263b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); 264b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 265b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 266b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); 267b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 268b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 269b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 270b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); 271b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 272b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // filter the source buffer 273b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); 274b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); 275b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 276b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 277b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); 278b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 279b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 280b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 281b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 282b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 284b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 285b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 286b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); 287b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); 288b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 289b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift by 7 bit each 16 bit 290b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); 291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); 292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 293b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shrink to 8 bit each 16 bits, the first lane contain the first 294b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // convolve result and the second lane contain the second convolve 295b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // result 296b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); 297b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 298b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian src_ptr+=src_pixels_per_line; 299b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 300b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // save 16 bytes 301b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); 302b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 303b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian output_ptr+=output_pitch; 304b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 305b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 306b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 307b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, 308b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int src_pitch, 309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned char *output_ptr, 310b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int out_pitch, 311b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_height, 312b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int16_t *filter) { 313b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; 314b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 315b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; 316b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int i; 317b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 318b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 320b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg = _mm_loadu_si128((__m128i *)filter); 321b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // converting the 16 bit (short) to 8 bit (byte) and have the same data 322b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // in both lanes of 128 bit register. 323b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 324b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 325b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the first 16 bits in the filter 326b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 327b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the second 16 bits in the filter 328b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 329b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the third 16 bits in the filter 330b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 331b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the forth 16 bits in the filter 332b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 333b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 334b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < output_height; i++) { 335b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load the first 8 bytes 336b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); 337b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load the next 8 bytes in stride of src_pitch 338b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); 339b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); 340b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); 341b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 342b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // merge the result together 343b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); 344b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); 345b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 346b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load the next 8 bytes in stride of src_pitch 347b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); 348b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); 349b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); 350b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); 351b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 352b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // merge the result together 353b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); 354b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); 355b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 356b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 357b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 358b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); 359b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 360b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); 361b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 362b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 363b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); 364b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); 365b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); 366b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 367b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); 368b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 369b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 370b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift by 7 bit each 16 bit 371b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 372b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 373b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shrink to 8 bit each 16 bits 374b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 375b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 376b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian src_ptr+=src_pitch; 377b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 378b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // save only 8 bytes convolve result 379b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); 380b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 381b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian output_ptr+=out_pitch; 382b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 383b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 384b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 385b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianvoid vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, 386b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int src_pitch, 387b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned char *output_ptr, 388b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int out_pitch, 389b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int output_height, 390b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian int16_t *filter) { 391b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; 392b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 393b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; 394b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian unsigned int i; 395b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 396b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 397b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 398b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg = _mm_loadu_si128((__m128i *)filter); 399b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // converting the 16 bit (short) to 8 bit (byte) and have the same data 400b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // in both lanes of 128 bit register. 401b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 402b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 403b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the first 16 bits in the filter 404b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 405b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the second 16 bits in the filter 406b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 407b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the third 16 bits in the filter 408b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 409b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // duplicate only the forth 16 bits in the filter 410b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 411b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 412b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian for (i = 0; i < output_height; i++) { 413b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load the first 16 bytes 414b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); 415b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load the next 16 bytes in stride of src_pitch 416b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); 417b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); 418b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); 419b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 420b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // merge the result together 421b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); 422b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); 423b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); 424b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); 425b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 426b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 427b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); 428b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); 429b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 430b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); 431b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 432b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 433b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); 434b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); 435b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 436b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load the next 16 bytes in stride of two/three src_pitch 437b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); 438b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); 439b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 440b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // merge the result together 441b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); 442b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); 443b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 444b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 445b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); 446b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); 447b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 448b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // load the next 16 bytes in stride of four/five src_pitch 449b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); 450b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); 451b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 452b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // merge the result together 453b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); 454b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); 455b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 456b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // multiply 2 adjacent elements with the filter and add the result 457b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); 458b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); 459b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 460b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 461b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, 462b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_min_epi16(srcRegFilt4, srcRegFilt7)); 463b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, 464b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_min_epi16(srcRegFilt6, srcRegFilt8)); 465b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 466b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // add and saturate the results together 467b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, 468b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_max_epi16(srcRegFilt4, srcRegFilt7)); 469b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, 470b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_max_epi16(srcRegFilt6, srcRegFilt8)); 471b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); 472b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 473b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 474b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shift by 7 bit each 16 bit 475b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); 476b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 477b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 478b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // shrink to 8 bit each 16 bits, the first lane contain the first 479b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // convolve result and the second lane contain the second convolve 480b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // result 481b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); 482b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 483b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian src_ptr+=src_pitch; 484b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 485b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian // save 16 bytes convolve result 486b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 487b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 488b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian output_ptr+=out_pitch; 489b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian } 490b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 491