/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>  // AVX2
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// Computes the sum of absolute differences (SAD) between one 32x32 source
// block and each of four 32x32 reference blocks in a single pass.
//
//   src        - top-left byte of the source block.
//   src_stride - byte distance between consecutive source rows.
//   ref        - four pointers, each to the top-left byte of a reference block.
//   ref_stride - byte distance between consecutive rows; shared by all four
//                reference blocks.
//   res        - receives the four results; res[k] is the SAD against ref[k].
//
// Every load and the final store use the unaligned forms, so none of the
// pointers needs any particular alignment.
void vpx_sad32x32x4d_avx2(const uint8_t *src,
                          int src_stride,
                          const uint8_t *const ref[4],
                          int ref_stride,
                          uint32_t res[4]) {
  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
  __m256i sum_mlow, sum_mhigh;
  int i;
  const uint8_t *ref0, *ref1, *ref2, *ref3;

  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];
  sum_ref0 = _mm256_setzero_si256();
  sum_ref1 = _mm256_setzero_si256();
  sum_ref2 = _mm256_setzero_si256();
  sum_ref3 = _mm256_setzero_si256();
  for (i = 0; i < 32; i++) {
    // load one 32-byte row from src and from every ref
    src_reg = _mm256_loadu_si256((const __m256i *)src);
    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
    // absolute differences against src, summed per group of 8 bytes; each
    // 64-bit lane receives one partial sum (at most 8 * 255) in its low bits
    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    // accumulate the row into the running totals of every ref
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }
  {
    __m128i sum;
    // Each 64-bit lane of sum_ref-i now holds a partial total in its low
    // 32 bits and zero in its high 32 bits (32 rows * 8 * 255 fits easily).
    // Shift sum_ref1 and sum_ref3 left by 4 bytes so that their partial
    // totals move into those empty high halves.
    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);

    // interleave: sum_ref0 becomes {ref0, ref1} pairs, sum_ref2 becomes
    // {ref2, ref3} pairs
    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);

    // gather the low and the high 64 bits of each 128-bit half so that both
    // registers are laid out as {ref0, ref1, ref2, ref3}
    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);

    // add the low 64 bit partials to the high 64 bit partials
    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);

    // add the low 128 bit to the high 128 bit
    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
                        _mm256_extractf128_si256(sum_mlow, 1));

    _mm_storeu_si128((__m128i *)(res), sum);
  }
}

// Computes the sum of absolute differences (SAD) between one 64x64 source
// block and each of four 64x64 reference blocks in a single pass.
//
//   src        - top-left byte of the source block.
//   src_stride - byte distance between consecutive source rows.
//   ref        - four pointers, each to the top-left byte of a reference block.
//   ref_stride - byte distance between consecutive rows; shared by all four
//                reference blocks.
//   res        - receives the four results; res[k] is the SAD against ref[k].
//
// Every load and the final store use the unaligned forms, so none of the
// pointers needs any particular alignment.
void vpx_sad64x64x4d_avx2(const uint8_t *src,
                          int src_stride,
                          const uint8_t *const ref[4],
                          int ref_stride,
                          uint32_t res[4]) {
  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
  __m256i ref3_reg, ref3next_reg;
  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
  __m256i sum_mlow, sum_mhigh;
  int i;
  const uint8_t *ref0, *ref1, *ref2, *ref3;

  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];
  sum_ref0 = _mm256_setzero_si256();
  sum_ref1 = _mm256_setzero_si256();
  sum_ref2 = _mm256_setzero_si256();
  sum_ref3 = _mm256_setzero_si256();
  for (i = 0; i < 64; i++) {
    // load one 64-byte row from src and from every ref as two 32-byte halves
    src_reg = _mm256_loadu_si256((const __m256i *)src);
    srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
    ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
    ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
    ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
    ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
    // absolute differences against src, summed per group of 8 bytes; each
    // 64-bit lane receives one partial sum (at most 8 * 255) in its low bits
    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);

    // accumulate both halves of the row into the running totals of every ref
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }
  {
    __m128i sum;

    // Each 64-bit lane of sum_ref-i now holds a partial total in its low
    // 32 bits and zero in its high 32 bits (64 rows * 2 * 8 * 255 fits
    // easily). Shift sum_ref1 and sum_ref3 left by 4 bytes so that their
    // partial totals move into those empty high halves.
    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);

    // interleave: sum_ref0 becomes {ref0, ref1} pairs, sum_ref2 becomes
    // {ref2, ref3} pairs
    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);

    // gather the low and the high 64 bits of each 128-bit half so that both
    // registers are laid out as {ref0, ref1, ref2, ref3}
    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);

    // add the low 64 bit partials to the high 64 bit partials
    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);

    // add the low 128 bit to the high 128 bit
    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
                        _mm256_extractf128_si256(sum_mlow, 1));

    _mm_storeu_si128((__m128i *)(res), sum);
  }
}