1411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org/*
2411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *
4411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *  Use of this source code is governed by a BSD-style license
5411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *  that can be found in the LICENSE file in the root of the source
6411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *  tree. An additional intellectual property rights grant can be found
7411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *  in the file PATENTS.  All contributing project authors may
8411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org *  be found in the AUTHORS file in the root of the source tree.
9411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org */
10411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
#include <string.h>
#include <tmmintrin.h>
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
14411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
15411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org// filters only for the 4_h8 convolution
16411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
17411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
18411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
19411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
20411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
21411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
22411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
23411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
24411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org// filters for 8_h8 and 16_h8
25411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
26411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
27411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
28411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
29411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
30411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
31411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
32411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
33411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
34411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
35411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
36411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
37411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
38411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
39411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
40411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
41411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
42411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned int src_pixels_per_line,
43411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned char *output_ptr,
44411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned int output_pitch,
45411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned int output_height,
46411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         int16_t *filter) {
4788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
48411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
49411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i addFilterReg64, filtersReg, srcReg, minReg;
50411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  unsigned int i;
51411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
52411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
53411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
54411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filtersReg = _mm_loadu_si128((__m128i *)filter);
55411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // converting the 16 bit (short) to  8 bit (byte) and have the same data
56411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // in both lanes of 128 bit register.
57411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
58411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
59411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the first 16 bits in the filter into the first lane
60411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
61411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the third 16 bit in the filter into the first lane
62411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
63411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the seconds 16 bits in the filter into the second lane
6488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
65411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
66411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the forth 16 bits in the filter into the second lane
6788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
68411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
69411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
70411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // loading the local filters
7188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
7288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
73411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
74411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  for (i = 0; i < output_height; i++) {
75411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
76411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
77411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // filter the source buffer
7888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
7988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
80411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
81411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // multiply 2 adjacent elements with the filter and add the result
82411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
83411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
84411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
85411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // extract the higher half of the lane
86411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
87411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
88411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
89411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
90411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
91411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // add and saturate all the results together
92411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
93411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
94411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
95411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
96411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
97411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
98411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // shift by 7 bit each 16 bits
99411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
100411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
101411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // shrink to 8 bit each 16 bits
102411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
103411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    src_ptr+=src_pixels_per_line;
104411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
105411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // save only 4 bytes
106411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
107411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
108411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    output_ptr+=output_pitch;
109411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  }
110411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}
111411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
112411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
113411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned int src_pixels_per_line,
114411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned char *output_ptr,
115411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned int output_pitch,
116411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         unsigned int output_height,
117411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                         int16_t *filter) {
118411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
119411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
120411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
121411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i addFilterReg64, filtersReg, minReg;
122411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  unsigned int i;
123411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
124411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
125411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
126411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filtersReg = _mm_loadu_si128((__m128i *)filter);
127411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // converting the 16 bit (short) to  8 bit (byte) and have the same data
128411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // in both lanes of 128 bit register.
129411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
130411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
131411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the first 16 bits (first and second byte)
132411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
133411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
134411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the second 16 bits (third and forth byte)
135411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
136411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
137411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the third 16 bits (fifth and sixth byte)
138411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
139411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
140411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the forth 16 bits (seventh and eighth byte)
141411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
142411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
143411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
144411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
145411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
146411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
147411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
148411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
149411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  for (i = 0; i < output_height; i++) {
150411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
151411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
152411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // filter the source buffer
153411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
154411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
155411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
156411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // multiply 2 adjacent elements with the filter and add the result
157411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
158411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
159411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
160411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // filter the source buffer
161411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
162411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
163411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
164411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // multiply 2 adjacent elements with the filter and add the result
165411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
166411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
167411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
168411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // add and saturate all the results together
16988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
17088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
171411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
17288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
173411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
17488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
175411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
176411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
177411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // shift by 7 bit each 16 bits
178411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
179411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
180411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // shrink to 8 bit each 16 bits
181411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
182411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
183411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    src_ptr+=src_pixels_per_line;
184411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
185411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // save only 8 bytes
186411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
187411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
188411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    output_ptr+=output_pitch;
189411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  }
190411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}
191411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
192411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgvoid vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
193411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                          unsigned int src_pixels_per_line,
194411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                          unsigned char *output_ptr,
195411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                          unsigned int output_pitch,
196411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                          unsigned int output_height,
197411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                          int16_t *filter) {
198411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
199411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
200411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
201411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
202411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  unsigned int i;
203411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
204411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
205411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
206411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filtersReg = _mm_loadu_si128((__m128i *)filter);
207411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // converting the 16 bit (short) to  8 bit (byte) and have the same data
208411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // in both lanes of 128 bit register.
209411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
210411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
211411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the first 16 bits (first and second byte)
212411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
213411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
214411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the second 16 bits (third and forth byte)
215411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
216411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
217411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the third 16 bits (fifth and sixth byte)
218411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
219411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
220411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // duplicate only the forth 16 bits (seventh and eighth byte)
221411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  // across 128 bit register
222411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
223411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
224411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
225411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
226411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
227411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
228411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
229411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  for (i = 0; i < output_height; i++) {
230411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
231411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
232411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // filter the source buffer
233411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
23488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
235411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
236411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // multiply 2 adjacent elements with the filter and add the result
237411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
23888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
239411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
240411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // add and saturate the results together
241411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
242411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
243411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // filter the source buffer
24488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
245411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
246411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
247411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // multiply 2 adjacent elements with the filter and add the result
24888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
249411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
250411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
251411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // add and saturate the results together
252411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
253411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
254411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
255411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // reading the next 16 bytes.
256411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // (part of it was being read by earlier read)
257411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
258411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
259411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // add and saturate the results together
260411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
261411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
262411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
263411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // filter the source buffer
264411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
26588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
266411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
267411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // multiply 2 adjacent elements with the filter and add the result
268411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
26988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
270411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
271411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // add and saturate the results together
272411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
273411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
274411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // filter the source buffer
27588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
276411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
277411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
278411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // multiply 2 adjacent elements with the filter and add the result
27988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
280411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
281411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
282411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // add and saturate the results together
283411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
284411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
285411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
286411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
287411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
288411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
289411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
290411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
291411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // shift by 7 bit each 16 bit
292411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
293411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
294411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
295411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // shrink to 8 bit each 16 bits, the first lane contain the first
296411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // convolve result and the second lane contain the second convolve
297411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // result
298411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
299411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
300411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    src_ptr+=src_pixels_per_line;
301411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
302411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    // save 16 bytes
303411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
304411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
305411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    output_ptr+=output_pitch;
306411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  }
307411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}
308411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
// Vertical 8-tap filter over a block that is 8 pixels wide (SSSE3).
//
// For every output row, 8 bytes are read from each of eight consecutive
// source rows (src_ptr, src_ptr + src_pitch, ..., src_ptr + 7 * src_pitch).
// Each column is multiplied by the eight taps, 64 is added for rounding, the
// sum is shifted right by 7 and clamped to 0..255, and the 8 resulting bytes
// are written to output_ptr.  src_ptr and output_ptr then advance by one row.
//
//   src_ptr        first of the eight source rows for the first output row
//   src_pitch      distance in bytes between source rows
//   output_ptr     destination of the first output row (no alignment needed)
//   out_pitch      distance in bytes between output rows
//   output_height  number of output rows to produce
//   filter         eight 16-bit taps; they are narrowed to signed 8 bit with
//                  saturation, so taps outside [-128, 127] are clamped
//
// All partial sums use saturating signed 16-bit arithmetic.
void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i rounding, taps;
  __m128i taps01, taps23, taps45, taps67;
  __m128i rows01, rows23, rows45, rows67;
  __m128i sum01, sum23, sum45, sum67;
  __m128i smaller, larger, total;
  unsigned int row;

  // 64 in every 16-bit lane: the rounding term for the final >> 7
  rounding = _mm_set1_epi16(64);

  // narrow the 16-bit taps to 8 bit; both halves of the register end up
  // holding the same eight bytes
  taps = _mm_loadu_si128((const __m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);

  // broadcast each pair of adjacent taps across a whole register so it lines
  // up with the row-interleaved source bytes built below
  taps01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  for (row = 0; row < output_height; ++row) {
    // interleave the bytes of vertically adjacent rows: every 16-bit lane
    // then holds (row k, row k + 1) for one column
    rows01 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)));
    rows23 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)),
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)));
    rows45 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)),
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    rows67 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)),
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));

    // unsigned pixel * signed tap, adjacent products summed per column
    sum01 = _mm_maddubs_epi16(rows01, taps01);
    sum23 = _mm_maddubs_epi16(rows23, taps23);
    sum45 = _mm_maddubs_epi16(rows45, taps45);
    sum67 = _mm_maddubs_epi16(rows67, taps67);

    // Saturating accumulation.  Of the two middle partial sums the smaller
    // is added before the larger -- presumably to keep the running total
    // away from the saturation limits for as long as possible.
    smaller = _mm_min_epi16(sum45, sum23);
    larger = _mm_max_epi16(sum45, sum23);
    total = _mm_adds_epi16(sum01, sum67);
    total = _mm_adds_epi16(total, smaller);
    total = _mm_adds_epi16(total, larger);
    total = _mm_adds_epi16(total, rounding);

    // divide by 128, then clamp each 16-bit value to an unsigned byte
    total = _mm_srai_epi16(total, 7);
    total = _mm_packus_epi16(total, total);

    // only the low 8 bytes hold this row's result
    _mm_storel_epi64((__m128i *)output_ptr, total);

    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
386411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
// Vertical 8-tap filter over a block that is 16 pixels wide (SSSE3).
//
// For every output row, 16 bytes are read from each of eight consecutive
// source rows (src_ptr, src_ptr + src_pitch, ..., src_ptr + 7 * src_pitch).
// Each column is multiplied by the eight taps, 64 is added for rounding, the
// sum is shifted right by 7 and clamped to 0..255, and the 16 resulting bytes
// are written to output_ptr.  src_ptr and output_ptr then advance by one row.
//
//   src_ptr        first of the eight source rows for the first output row
//   src_pitch      distance in bytes between source rows
//   output_ptr     destination of the first output row; written with an
//                  unaligned store, so no 16-byte alignment is required
//   out_pitch      distance in bytes between output rows
//   output_height  number of output rows to produce
//   filter         eight 16-bit taps; they are narrowed to signed 8 bit with
//                  saturation, so taps outside [-128, 127] are clamped
//
// All partial sums use saturating signed 16-bit arithmetic.
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i rounding, taps;
  __m128i taps01, taps23, taps45, taps67;
  __m128i rowA, rowB;
  __m128i lo01, hi01, lo23, hi23, lo45, hi45, lo67, hi67;
  __m128i totalLo, totalHi;
  unsigned int row;

  // 64 in every 16-bit lane: the rounding term for the final >> 7
  rounding = _mm_set1_epi16(64);

  // narrow the 16-bit taps to 8 bit; both halves of the register end up
  // holding the same eight bytes
  taps = _mm_loadu_si128((const __m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);

  // broadcast each pair of adjacent taps across a whole register so it lines
  // up with the row-interleaved source bytes built below
  taps01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  for (row = 0; row < output_height; ++row) {
    // Each pair of rows is byte-interleaved so that a 16-bit lane holds
    // (row k, row k + 1) for one column; "lo" covers columns 0..7 and "hi"
    // columns 8..15.  maddubs then yields pixel*tap + pixel*tap per column.

    // rows 0 and 1 with taps 0 and 1
    rowA = _mm_loadu_si128((const __m128i *)src_ptr);
    rowB = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    lo01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(rowA, rowB), taps01);
    hi01 = _mm_maddubs_epi16(_mm_unpackhi_epi8(rowA, rowB), taps01);

    // rows 6 and 7 with taps 6 and 7
    rowA = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    rowB = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    lo67 = _mm_maddubs_epi16(_mm_unpacklo_epi8(rowA, rowB), taps67);
    hi67 = _mm_maddubs_epi16(_mm_unpackhi_epi8(rowA, rowB), taps67);

    // outer taps are accumulated first
    totalLo = _mm_adds_epi16(lo01, lo67);
    totalHi = _mm_adds_epi16(hi01, hi67);

    // rows 2 and 3 with taps 2 and 3
    rowA = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    rowB = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    lo23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(rowA, rowB), taps23);
    hi23 = _mm_maddubs_epi16(_mm_unpackhi_epi8(rowA, rowB), taps23);

    // rows 4 and 5 with taps 4 and 5
    rowA = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    rowB = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    lo45 = _mm_maddubs_epi16(_mm_unpacklo_epi8(rowA, rowB), taps45);
    hi45 = _mm_maddubs_epi16(_mm_unpackhi_epi8(rowA, rowB), taps45);

    // Saturating accumulation.  Of the two middle partial sums the smaller
    // is added before the larger -- presumably to keep the running total
    // away from the saturation limits for as long as possible.
    totalLo = _mm_adds_epi16(totalLo, _mm_min_epi16(lo23, lo45));
    totalHi = _mm_adds_epi16(totalHi, _mm_min_epi16(hi23, hi45));
    totalLo = _mm_adds_epi16(totalLo, _mm_max_epi16(lo23, lo45));
    totalHi = _mm_adds_epi16(totalHi, _mm_max_epi16(hi23, hi45));
    totalLo = _mm_adds_epi16(totalLo, rounding);
    totalHi = _mm_adds_epi16(totalHi, rounding);

    // divide by 128
    totalLo = _mm_srai_epi16(totalLo, 7);
    totalHi = _mm_srai_epi16(totalHi, 7);

    // Clamp to unsigned bytes: columns 0..7 land in the low 8 bytes and
    // columns 8..15 in the high 8 bytes.  The store is unaligned because
    // nothing in this function guarantees that output_ptr (or out_pitch)
    // keeps every row on a 16-byte boundary; an aligned store would fault
    // on such a destination.
    _mm_storeu_si128((__m128i *)output_ptr,
                     _mm_packus_epi16(totalLo, totalHi));

    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
493