/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>  // AVX2
#include "vpx/vpx_integer.h"
1293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
// Computes the sum of absolute differences (SAD) between one 32x32 source
// block and four 32x32 reference blocks in a single pass over the rows.
//
//   src         top-left byte of the source block
//   src_stride  byte distance between consecutive source rows
//   ref         four pointers, each to the top-left byte of a reference block
//   ref_stride  byte distance between consecutive rows; shared by all four
//               reference blocks
//   res         receives the four results; res[k] is the SAD against ref[k]
//
// Every load and the final store use the unaligned intrinsics, so no pointer
// has to satisfy any particular alignment.
void vp9_sad32x32x4d_avx2(uint8_t *src,
                          int src_stride,
                          uint8_t *ref[4],
                          int ref_stride,
                          unsigned int res[4]) {
  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
  __m256i sum_mlow, sum_mhigh;
  int i;
  uint8_t *ref0, *ref1, *ref2, *ref3;

  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];
  sum_ref0 = _mm256_setzero_si256();
  sum_ref1 = _mm256_setzero_si256();
  sum_ref2 = _mm256_setzero_si256();
  sum_ref3 = _mm256_setzero_si256();
  for (i = 0; i < 32; i++) {
    // load one 32-byte row of src and of every reference block
    src_reg = _mm256_loadu_si256((__m256i *)(src));
    ref0_reg = _mm256_loadu_si256((__m256i *)(ref0));
    ref1_reg = _mm256_loadu_si256((__m256i *)(ref1));
    ref2_reg = _mm256_loadu_si256((__m256i *)(ref2));
    ref3_reg = _mm256_loadu_si256((__m256i *)(ref3));
    // SAD of each group of 8 bytes; every register now holds four partial
    // sums, one in the low bits of each 64-bit element
    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    // accumulate the row into the per-reference running totals
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }
  {
    __m128i sum;
    // In each sum_ref-i every 64-bit element carries its partial sum in the
    // low 4 bytes; the high 4 bytes are zero.  Shifting sum_ref1 and sum_ref3
    // left by 4 bytes (per 128-bit lane) moves their partial sums into those
    // empty high halves.
    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);

    // interleave: sum_ref0 = {ref0, ref1, ...}, sum_ref2 = {ref2, ref3, ...}
    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);

    // gather matching 64-bit halves so each lane reads {ref0, ref1, ref2, ref3}
    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);

    // add the low 64-bit partial sums to the high 64-bit partial sums
    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);

    // add the low 128-bit lane to the high 128-bit lane
    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
                        _mm256_extracti128_si256(sum_mlow, 1));

    // res[0..3] <- SAD against ref[0..3]
    _mm_storeu_si128((__m128i *)(res), sum);
  }
}
8293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
// Computes the sum of absolute differences (SAD) between one 64x64 source
// block and four 64x64 reference blocks in a single pass over the rows.
// Each 64-byte row is processed as two 32-byte halves.
//
//   src         top-left byte of the source block
//   src_stride  byte distance between consecutive source rows
//   ref         four pointers, each to the top-left byte of a reference block
//   ref_stride  byte distance between consecutive rows; shared by all four
//               reference blocks
//   res         receives the four results; res[k] is the SAD against ref[k]
//
// Every load and the final store use the unaligned intrinsics, so no pointer
// has to satisfy any particular alignment.
void vp9_sad64x64x4d_avx2(uint8_t *src,
                          int src_stride,
                          uint8_t *ref[4],
                          int ref_stride,
                          unsigned int res[4]) {
  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
  __m256i ref3_reg, ref3next_reg;
  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
  __m256i sum_mlow, sum_mhigh;
  int i;
  uint8_t *ref0, *ref1, *ref2, *ref3;

  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];
  sum_ref0 = _mm256_setzero_si256();
  sum_ref1 = _mm256_setzero_si256();
  sum_ref2 = _mm256_setzero_si256();
  sum_ref3 = _mm256_setzero_si256();
  for (i = 0; i < 64; i++) {
    // load one 64-byte row (two 32-byte halves) of src and of every
    // reference block
    src_reg = _mm256_loadu_si256((__m256i *)(src));
    srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32));
    ref0_reg = _mm256_loadu_si256((__m256i *)(ref0));
    ref0next_reg = _mm256_loadu_si256((__m256i *)(ref0 + 32));
    ref1_reg = _mm256_loadu_si256((__m256i *)(ref1));
    ref1next_reg = _mm256_loadu_si256((__m256i *)(ref1 + 32));
    ref2_reg = _mm256_loadu_si256((__m256i *)(ref2));
    ref2next_reg = _mm256_loadu_si256((__m256i *)(ref2 + 32));
    ref3_reg = _mm256_loadu_si256((__m256i *)(ref3));
    ref3next_reg = _mm256_loadu_si256((__m256i *)(ref3 + 32));
    // SAD of each group of 8 bytes; every register now holds four partial
    // sums, one in the low bits of each 64-bit element
    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);

    // accumulate both halves of the row into the per-reference running totals
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }
  {
    __m128i sum;

    // In each sum_ref-i every 64-bit element carries its partial sum in the
    // low 4 bytes; the high 4 bytes are zero.  Shifting sum_ref1 and sum_ref3
    // left by 4 bytes (per 128-bit lane) moves their partial sums into those
    // empty high halves.
    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);

    // interleave: sum_ref0 = {ref0, ref1, ...}, sum_ref2 = {ref2, ref3, ...}
    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);

    // gather matching 64-bit halves so each lane reads {ref0, ref1, ref2, ref3}
    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);

    // add the low 64-bit partial sums to the high 64-bit partial sums
    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);

    // add the low 128-bit lane to the high 128-bit lane
    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
                        _mm256_extracti128_si256(sum_mlow, 1));

    // res[0..3] <- SAD against ref[0..3]
    _mm_storeu_si128((__m128i *)(res), sum);
  }
}
168