1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <immintrin.h>
12#include "vpx_ports/mem.h"
13
14// filters for 16_h8 and 16_v8
// Shuffle mask selecting byte pairs (i, i+1) for filter taps 0/1,
// duplicated in both 128-bit lanes.
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
19
// Shuffle mask selecting byte pairs (i+2, i+3) for filter taps 2/3,
// duplicated in both 128-bit lanes.
DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
24
// Shuffle mask selecting byte pairs (i+4, i+5) for filter taps 4/5,
// duplicated in both 128-bit lanes.
DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};
29
// Shuffle mask selecting byte pairs (i+6, i+7) for filter taps 6/7,
// duplicated in both 128-bit lanes.
DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
34
// Portability shim for the 128->256 bit broadcast. Older compilers shipped
// this intrinsic under a different name/signature:
//  - clang <= 3.3 (and Apple clang 5.0) and gcc <= 4.6 expose
//    _mm_broadcastsi128_si256 taking a pointer to the 128-bit source;
//  - gcc 4.7 exposes _mm_broadcastsi128_si256 taking the value directly;
//  - newer compilers use the standard _mm256_broadcastsi128_si256.
#if defined(__clang__)
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
      (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# else  // clang > 3.3, and not 5.0 on macosx.
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // clang <= 3.3
#elif defined(__GNUC__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
# else  // gcc > 4.7
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // gcc <= 4.6
#else  // !(gcc || clang)
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
55
// Horizontal 8-tap filter for a 16 pixel wide block (AVX2).
// Two output rows are produced per loop iteration by keeping one row in
// each 128-bit lane of a 256-bit register; a trailing odd row is handled
// with 128-bit code after the loop.
//   src_ptr             : source, offset so that the 8 taps read
//                         src_ptr[-3] .. src_ptr[+4] for the first pixel
//   src_pixels_per_line : source stride in bytes
//   output_ptr          : destination; rows are written with
//                         _mm_store_si128, so it must be 16-byte aligned
//   output_pitch        : destination stride in bytes
//   output_height       : number of rows to filter
//   filter              : the 8 signed 16-bit filter taps
void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int  output_pitch,
                                  unsigned int  output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (64 is the rounding constant for the final shift-right by 7)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i-=2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr-3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr+src_pixels_per_line-3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together; the two middle products are
    // accumulated as min() first, then max() below, so the saturating adds
    // are performed in a fixed value order.  NOTE(review): this appears
    // intended to keep results bit-identical to the SSSE3 path — confirm
    // against vp9_filter_block1d16_h8_ssse3.
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read); together with the -3
    // load above, the +5 load covers all taps for output pixels 8..15
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr+5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr+src_pixels_per_line+5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together (same min/max ordering trick
    // as above)
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // add the rounding constant before shifting
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr+=src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr,
    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr+=dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together (min/max ordering as in the
    // main loop)
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // add the rounding constant before shifting
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
  }
}
299
// Vertical 8-tap filter for a 16 pixel wide block (AVX2).
// Seven source rows are pre-loaded and byte-interleaved in pairs; each loop
// iteration loads two more rows and produces two output rows, one per
// 128-bit lane. A trailing odd row is handled with 128-bit code.
//   src_ptr       : source; rows src_ptr[0] .. src_ptr[7*src_pitch]
//                   contribute to the first output row
//   src_pitch     : source stride in bytes
//   output_ptr    : destination; written with _mm_store_si128, so it must
//                   be 16-byte aligned
//   out_pitch     : destination stride in bytes
//   output_height : number of rows to produce
//   filter        : the 8 signed 16-bit filter taps
void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (64 is the rounding constant for the final shift-right by 7)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));

  // have each consecutive loads on the same 256 register
  // (register n ends up with row n-1 in its low lane, row n in its high)
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  // (byte-interleave adjacent rows so maddubs can weight row pairs)
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);


  for (i = output_height; i > 1; i-=2) {
     // load the last 2 loads of 16 bytes and have every two
     // consecutive loads in the same 256 bit register
     srcReg32b8 = _mm256_castsi128_si256(
     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
     _mm256_castsi256_si128(srcReg32b8), 1);
     srcReg32b9 = _mm256_castsi128_si256(
     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
     _mm256_castsi256_si128(srcReg32b9), 1);

     // merge every two consecutive registers
     // save
     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);


     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
     srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);


     // add and saturate the results together; min() is accumulated before
     // max() so the saturating adds happen in a fixed value order
     // (NOTE(review): appears intended to match the SSSE3 path — confirm)
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_min_epi16(srcReg32b6, srcReg32b13));

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_max_epi16(srcReg32b6, srcReg32b13));


     // add the rounding constant before shifting
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

     // shift by 7 bit each 16 bit
     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

     // shrink to 8 bit each 16 bits, the first lane contain the first
     // convolve result and the second lane contain the second convolve
     // result
     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

     src_ptr+=src_stride;

     // save 16 bytes
     _mm_store_si128((__m128i*)output_ptr,
     _mm256_castsi256_si128(srcReg32b1));

     // save the next 16 bytes
     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
     _mm256_extractf128_si256(srcReg32b1, 1));

     output_ptr+=dst_stride;

     // save part of the registers for next strides
     // (rotate the interleaved row pairs down by two rows; only the low
     // 128 bits of srcReg32b9/srcReg32b7 are consumed afterwards)
     srcReg32b10 = srcReg32b11;
     srcReg32b1 = srcReg32b3;
     srcReg32b11 = srcReg32b2;
     srcReg32b3 = srcReg32b5;
     srcReg32b2 = srcReg32b4;
     srcReg32b5 = srcReg32b7;
     srcReg32b7 = srcReg32b9;
  }
  // if the number of output rows is odd, produce the last row with
  // 128-bit operations
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);


    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together (min/max ordering as in the
    // main loop)
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));


    // add the rounding constant before shifting
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
  }
}
545