/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"

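// Two-tap bilinear filters for the 16 sub-pixel positions.  Each position
// gets one 32-byte (32-byte aligned) entry with its two taps interleaved and
// repeated across the whole 256-bit register, so an offset selects its filter
// with (offset << 5).  The taps always sum to 16 (4 bits of filter precision).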
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
};

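// Apply a two-tap filter to the byte pairs interleaved in exp_src_lo and
// exp_src_hi: each 16-bit result is (a * f0 + b * f1 + 8) >> 4, i.e. a
// bilinear interpolation rounded to the nearest integer.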
#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

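// Interleave the bytes of src_reg and reg into 16-bit lanes; with
// reg == zero_reg this zero-extends each source byte to 16 bits.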
#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  /* average the current source with the source size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

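// Accumulate the signed differences in sum_reg (16-bit lanes) and the
// squared differences in sse_reg (32-bit lanes).  Each 16-bit sum lane
// collects 2 * height differences of at most +/-255, which stays inside
// int16 range as long as height is at most 64 rows.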
#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation of the sum and sse
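// The comparison against zero supplies the sign bits needed to widen the
// 16-bit sums to 32 bits; both accumulators are then reduced across the two
// 128-bit lanes.  The squared-error total is stored through *sse and the
// signed difference total is left in `sum`; the 32xN/64xN wrappers are
// expected to combine them into the final variance (sse - sum * sum / N).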
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                 _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));


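/* For reference, a scalar sketch of what the general (bilinear/bilinear)
 * path of the routine below computes; the names here are illustrative only
 * and this block is not compiled:
 *
 *   // first pass: horizontal bilinear filter over height + 1 rows
 *   for (r = 0; r <= height; ++r)
 *     for (c = 0; c < 32; ++c)
 *       fdata[r][c] = (src[r * src_stride + c] * (16 - x_offset) +
 *                      src[r * src_stride + c + 1] * x_offset + 8) >> 4;
 *   // second pass: vertical bilinear filter, then sum/SSE against dst
 *   for (r = 0; r < height; ++r)
 *     for (c = 0; c < 32; ++c) {
 *       pred = (fdata[r][c] * (16 - y_offset) +
 *               fdata[r + 1][c] * y_offset + 8) >> 4;
 *       diff = pred - dst[r * dst_stride + c];
 *       sum += diff;
 *       sse += diff * diff;
 *     }
 */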
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

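  // Offsets index the 16-entry bilinear table: 0 means the source is used
  // as-is and 8 (the half-pel position) reduces to a byte average, so both
  // are special-cased; any other offset goes through FILTER_SRC.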
  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous and current packed rows
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}

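// Identical to vp9_sub_pixel_variance32xh_avx2 above, except that the
// filtered prediction is additionally averaged with the second predictor
// `sec` (as used for compound prediction) before the sum and SSE are
// accumulated.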
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous and current packed rows
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}