14fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang/* 24fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 34fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang * 44fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang * Use of this source code is governed by a BSD-style license 54fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang * that can be found in the LICENSE file in the root of the source 64fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang * tree. An additional intellectual property rights grant can be found 74fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang * in the file PATENTS. All contributing project authors may 84fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang * be found in the AUTHORS file in the root of the source tree. 94fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang */ 104fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang#include <immintrin.h> // AVX2 117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 124fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang#include "vpx/vpx_integer.h" 134fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid vpx_sad32x32x4d_avx2(const uint8_t *src, 154fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang int src_stride, 167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *const ref[4], 174fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang int ref_stride, 187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint32_t res[4]) { 194fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; 204fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; 214fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i sum_mlow, sum_mhigh; 224fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang int i; 237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *ref0, *ref1, *ref2, *ref3; 244fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 254fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref0 = ref[0]; 264fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref1 = ref[1]; 274fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref2 = ref[2]; 284fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref3 = ref[3]; 294fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref0 = _mm256_set1_epi16(0); 304fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref1 = _mm256_set1_epi16(0); 314fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref2 = _mm256_set1_epi16(0); 324fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref3 = _mm256_set1_epi16(0); 334fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang for (i = 0; i < 32 ; i++) { 344fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // load src and all refs 357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian src_reg = _mm256_loadu_si256((const __m256i *)src); 367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); 377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); 387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); 397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); 404fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // sum of the absolute differences between every ref-i to src 414fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); 424fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); 434fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); 444fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); 454fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // sum every ref-i 464fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); 474fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); 484fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); 494fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); 504fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 514fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang src+= src_stride; 524fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref0+= ref_stride; 534fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref1+= ref_stride; 544fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref2+= ref_stride; 554fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref3+= ref_stride; 564fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang } 574fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang { 584fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m128i sum; 594fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // in sum_ref-i the result is saved in the first 4 bytes 604fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // the other 4 bytes are zeroed. 614fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // sum_ref1 and sum_ref3 are shifted left by 4 bytes 624fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref1 = _mm256_slli_si256(sum_ref1, 4); 634fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref3 = _mm256_slli_si256(sum_ref3, 4); 644fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 654fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 664fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); 674fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); 684fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 694fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // merge every 64 bit from each sum_ref-i 704fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); 714fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); 724fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 734fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // add the low 64 bit to the high 64 bit 744fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); 754fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 764fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // add the low 128 bit to the high 128 bit 774fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), 784fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang _mm256_extractf128_si256(sum_mlow, 1)); 794fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 804fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang _mm_storeu_si128((__m128i *)(res), sum); 814fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang } 824fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang} 834fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid vpx_sad64x64x4d_avx2(const uint8_t *src, 854fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang int src_stride, 867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *const ref[4], 874fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang int ref_stride, 887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint32_t res[4]) { 894fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; 904fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; 914fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i ref3_reg, ref3next_reg; 924fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; 934fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m256i sum_mlow, sum_mhigh; 944fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang int i; 957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *ref0, *ref1, *ref2, *ref3; 964fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 974fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref0 = ref[0]; 984fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref1 = ref[1]; 994fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref2 = ref[2]; 1004fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref3 = ref[3]; 1014fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref0 = _mm256_set1_epi16(0); 1024fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref1 = _mm256_set1_epi16(0); 1034fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref2 = _mm256_set1_epi16(0); 1044fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref3 = _mm256_set1_epi16(0); 1054fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang for (i = 0; i < 64 ; i++) { 1064fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // load 64 bytes from src and all refs 1077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian src_reg = _mm256_loadu_si256((const __m256i *)src); 1087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); 1097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); 1107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); 1117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); 1127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); 1137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); 1147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); 1157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); 1167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); 1174fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // sum of the absolute differences between every ref-i to src 1184fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); 1194fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); 1204fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); 1214fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); 1224fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); 1234fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); 1244fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); 1254fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); 1264fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 1274fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // sum every ref-i 1284fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); 1294fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); 1304fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); 1314fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); 1324fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); 1334fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); 1344fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); 1354fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); 1364fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang src+= src_stride; 1374fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref0+= ref_stride; 1384fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref1+= ref_stride; 1394fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref2+= ref_stride; 1404fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang ref3+= ref_stride; 1414fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang } 1424fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang { 1434fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang __m128i sum; 1444fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 1454fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // in sum_ref-i the result is saved in the first 4 bytes 1464fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // the other 4 bytes are zeroed. 1474fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // sum_ref1 and sum_ref3 are shifted left by 4 bytes 1484fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref1 = _mm256_slli_si256(sum_ref1, 4); 1494fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref3 = _mm256_slli_si256(sum_ref3, 4); 1504fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 1514fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 1524fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); 1534fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); 1544fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 1554fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // merge every 64 bit from each sum_ref-i 1564fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); 1574fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); 1584fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 1594fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // add the low 64 bit to the high 64 bit 1604fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); 1614fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 1624fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang // add the low 128 bit to the high 128 bit 1634fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), 1644fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang _mm256_extractf128_si256(sum_mlow, 1)); 1654fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang 1664fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang _mm_storeu_si128((__m128i *)(res), sum); 1674fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang } 1684fb68e5dd4e93c7599dc905d861de11ac39c5585hkuang} 169