variance_impl_avx2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};

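// Each 32-byte row of the table above holds one bilinear filter, stored as
// interleaved (even, odd) tap pairs that always sum to 16 so a row can be
// fed directly to _mm256_maddubs_epi16; a sub-pixel offset shifted left by
// 5 selects its row. A scalar sketch of the weighting a row encodes
// (illustrative only, not referenced by the library):
static unsigned char bilinear_tap_example(unsigned char a, unsigned char b,
                                          const unsigned char *filter) {
  // weighted average of two neighbouring pixels with round-to-nearest,
  // matching the add-8 / shift-right-by-4 sequence in FILTER_SRC below
  return (unsigned char)((a * filter[0] + b * filter[1] + 8) >> 4);
}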

void vpx_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum) {
    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
    __m256i ref_expand_high, madd_low, madd_high;
    unsigned int i, src_2strides, ref_2strides;
    __m256i zero_reg = _mm256_set1_epi16(0);
    __m256i sum_ref_src = _mm256_set1_epi16(0);
    __m256i madd_ref_src = _mm256_set1_epi16(0);

    // process two rows in one 256-bit register, halving the number of loop
    // iterations compared to the sse2 code
    src_2strides = source_stride << 1;
    ref_2strides = recon_stride << 1;
    for (i = 0; i < 8; i++) {
        src = _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i const *) (src_ptr)));
        src = _mm256_inserti128_si256(src,
              _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1);

        ref = _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i const *) (ref_ptr)));
        ref = _mm256_inserti128_si256(ref,
              _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1);

        // expand each byte to 16 bits
        src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
        src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

        ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
        ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

        // src - ref
        src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
        src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

        // madd low (src - ref)
        madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

        // add high to low
        src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

        // madd high (src - ref)
        madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

        sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

        // add high to low
        madd_ref_src = _mm256_add_epi32(madd_ref_src,
                       _mm256_add_epi32(madd_low, madd_high));

        src_ptr+= src_2strides;
        ref_ptr+= ref_2strides;
    }

    {
        __m128i sum_res, madd_res;
        __m128i expand_sum_low, expand_sum_high, expand_sum;
        __m128i expand_madd_low, expand_madd_high, expand_madd;
        __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

        // extract the low lane and add it to the high lane
        sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
                                _mm256_extractf128_si256(sum_ref_src, 1));

        madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
                                 _mm256_extractf128_si256(madd_ref_src, 1));

        // widen each 16-bit sum to 32 bits (zeros in the low half)
        expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
                                            sum_res);
        expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
                                             sum_res);

        // arithmetic shift right by 16 bits to sign-extend
        expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
        expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);

        expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);

        // expand each 32 bits of the madd result to 64 bits
        expand_madd_low = _mm_unpacklo_epi32(madd_res,
                          _mm256_castsi256_si128(zero_reg));
        expand_madd_high = _mm_unpackhi_epi32(madd_res,
                           _mm256_castsi256_si128(zero_reg));

        expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);

        ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
                            _mm256_castsi256_si128(zero_reg));
        ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
                             _mm256_castsi256_si128(zero_reg));

        ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

        // shift right by 8 bytes
        madd_res = _mm_srli_si128(expand_madd, 8);
        sum_res = _mm_srli_si128(ex_expand_sum, 8);

        madd_res = _mm_add_epi32(madd_res, expand_madd);
        sum_res = _mm_add_epi32(sum_res, ex_expand_sum);

        *((int*)SSE)= _mm_cvtsi128_si32(madd_res);

        *((int*)Sum)= _mm_cvtsi128_si32(sum_res);
    }
}
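// For reference, a plain scalar version of the computation above: accumulate
// the sum of (src - ref) and the sum of squared differences over a 16x16
// block. Illustrative sketch only (not referenced by the library); the AVX2
// code processes two rows per iteration and reduces its accumulators at the
// end instead.
static void get16x16var_scalar_example(const unsigned char *src_ptr,
                                       int source_stride,
                                       const unsigned char *ref_ptr,
                                       int recon_stride,
                                       unsigned int *SSE, int *Sum) {
  int i, j, sum = 0;
  unsigned int sse = 0;
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 16; j++) {
      const int diff = src_ptr[j] - ref_ptr[j];
      sum += diff;
      sse += diff * diff;
    }
    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }
  *SSE = sse;
  *Sum = sum;
}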

void vpx_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum) {
    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
    __m256i ref_expand_high, madd_low, madd_high;
    unsigned int i;
    __m256i zero_reg = _mm256_set1_epi16(0);
    __m256i sum_ref_src = _mm256_set1_epi16(0);
    __m256i madd_ref_src = _mm256_set1_epi16(0);

    // process one 32-pixel row per iteration (16 rows in total)
    for (i = 0; i < 16; i++) {
       src = _mm256_loadu_si256((__m256i const *) (src_ptr));

       ref = _mm256_loadu_si256((__m256i const *) (ref_ptr));

       // expand each byte to 16 bits
       src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
       src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

       ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
       ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

       // src - ref
       src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
       src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

       // madd low (src - ref)
       madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

       // add high to low
       src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

       // madd high (src - ref)
       madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

       sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

       // add high to low
       madd_ref_src = _mm256_add_epi32(madd_ref_src,
                      _mm256_add_epi32(madd_low, madd_high));

       src_ptr+= source_stride;
       ref_ptr+= recon_stride;
    }

    {
      __m256i expand_sum_low, expand_sum_high, expand_sum;
      __m256i expand_madd_low, expand_madd_high, expand_madd;
      __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

      // widen each 16-bit sum to 32 bits (zeros in the low half)
      expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
      expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);

      // arithmetic shift right by 16 bits to sign-extend
      expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
      expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);

      expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);

      // expand each 32 bits of the madd result to 64 bits
      expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
      expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);

      expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);

      ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
      ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);

      ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

      // shift right by 8 bytes
      madd_ref_src = _mm256_srli_si256(expand_madd, 8);
      sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);

      madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
      sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);

      // extract the low lane and the high lane and add the results
      *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
      _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));

      *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
      _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
    }
}
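// Note that the loop above covers 16 rows of 32 pixels, so a full 32x32
// block takes two calls. Both helpers return a block sum and SSE rather than
// a variance; callers combine them as variance = SSE - sum^2 / (w * h). A
// minimal sketch of that final step for one 32x32 block (hypothetical
// wrapper shown for illustration; the real wrappers live elsewhere in
// vpx_dsp):
static unsigned int variance32x32_example(const unsigned char *src_ptr,
                                          int source_stride,
                                          const unsigned char *ref_ptr,
                                          int recon_stride,
                                          unsigned int *sse) {
  unsigned int sse0, sse1;
  int sum0, sum1;
  vpx_get32x32var_avx2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sse0, &sum0);
  vpx_get32x32var_avx2(src_ptr + 16 * source_stride, source_stride,
                       ref_ptr + 16 * recon_stride, recon_stride,
                       &sse1, &sum1);
  *sse = sse0 + sse1;
  // 32 * 32 = 1024 pixels, so the squared sum is divided by 1024 via a shift
  return *sse - (unsigned int)(((int64_t)(sum0 + sum1) * (sum0 + sum1)) >> 10);
}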

#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  /* average the current source with the one size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation of sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));


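// The sub-pixel routines below choose one of three strategies per axis:
// offset 0 uses the source as-is, offset 8 is a rounding average of the two
// neighbouring pixels (the same result the (8, 8) filter row would give),
// and any other offset applies the bilinear filter selected from
// bilinear_filters_avx2. When both offsets need filtering, a horizontal pass
// runs first and a vertical pass then blends consecutive packed rows. A
// scalar sketch of that fully sub-pixel case for a single output pixel
// (illustrative only, not referenced by the library):
static unsigned char subpel_2d_example(const unsigned char *src, int stride,
                                       const unsigned char *xfilter,
                                       const unsigned char *yfilter) {
  // horizontal bilinear tap on the current row and on the row below,
  // each rounded back to 8 bits as the packus step does
  const int row0 = (src[0] * xfilter[0] + src[1] * xfilter[1] + 8) >> 4;
  const int row1 =
      (src[stride] * xfilter[0] + src[stride + 1] * xfilter[1] + 8) >> 4;
  // vertical bilinear tap between the two filtered rows
  return (unsigned char)((row0 * yfilter[0] + row1 * yfilter[1] + 8) >> 4);
}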
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    }
  // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically between the two packed rows
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
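// Sketch of how a caller might wrap the helper above into a full 32x32
// sub-pixel variance (hypothetical wrapper shown for illustration; the real
// wrappers live elsewhere in vpx_dsp). The helper returns the signed block
// sum through its unsigned return type, so it is read back into an int
// before squaring.
static unsigned int sub_pixel_variance32x32_example(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse) {
  const int sum = (int)vpx_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  // 32 * 32 = 1024 pixels, so the squared sum is divided by 1024 via a shift
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}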

unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             const uint8_t *sec,
                                             int sec_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    }
  // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec+= sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src+= src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src+= src_stride;
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec+= sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack each 16-bit value back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src+= src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically between the two packed rows
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec+= sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst+= dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}

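// The avg variant above differs from vpx_sub_pixel_variance32xh_avx2 only in
// that the filtered prediction is first averaged with the second predictor
// sec before it is compared against dst. Per byte, _mm256_avg_epu8 computes
// the rounding average sketched below (illustrative only, not referenced by
// the library):
static unsigned char avg_with_second_pred_example(unsigned char pred,
                                                  unsigned char sec_pred) {
  return (unsigned char)((pred + sec_pred + 1) >> 1);
}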