/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

/*
 * Accumulates, over a block of 16 columns by 16 rows of 8-bit pixels, the
 * sum of the differences (src - ref) and the sum of their squares.
 *
 *   src_ptr, source_stride  first source row and the byte step between rows
 *   ref_ptr, recon_stride   first reference row and the byte step between rows
 *   SSE                     receives the sum of (src - ref)^2
 *   Sum                     receives the signed sum of (src - ref)
 *
 * Rows are read with unaligned loads, so no alignment is required. Strides
 * are signed and may be negative (bottom-up traversal).
 */
void vp9_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i;
  // Two rows are consumed per iteration. The doubled strides stay signed:
  // holding them in an unsigned type makes a negative stride wrap to a huge
  // positive pointer offset on 64-bit targets.
  const int src_2strides = source_stride * 2;
  const int ref_2strides = recon_stride * 2;
  const __m256i zero_reg = _mm256_setzero_si256();
  __m256i sum_ref_src = _mm256_setzero_si256();
  __m256i madd_ref_src = _mm256_setzero_si256();

  // processing two rows in a 256 bit register halves the number of loop
  // iterations (compared to the sse2 code)
  for (i = 0; i < 8; i++) {
    // current row in the low 128-bit lane, the row one stride further in
    // the high lane
    src = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i const *)(src_ptr)));
    src = _mm256_inserti128_si256(
        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);

    ref = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i const *)(ref_ptr)));
    ref = _mm256_inserti128_si256(
        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);

    // zero-extend every byte to 16 bits (per 128-bit lane)
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref, each difference lies in [-255, 255]
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // squared differences, pairwise added into 32-bit elements
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    // Running sum of differences in 16-bit elements. Each element gains at
    // most 2 * 255 per iteration, so 8 iterations plus the final lane add
    // stay within +/-8160 and cannot overflow int16.
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // running sum of squared differences in 32-bit elements
    madd_ref_src = _mm256_add_epi32(madd_ref_src,
                                    _mm256_add_epi32(madd_low, madd_high));

    src_ptr += src_2strides;
    ref_ptr += ref_2strides;
  }

  {
    const __m128i zero128 = _mm_setzero_si128();
    __m128i sum_res, madd_res;
    __m128i expand_sum_low, expand_sum_high, expand_sum;
    __m128i expand_madd_low, expand_madd_high, expand_madd;
    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // fold the high 128-bit lane onto the low lane
    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
                            _mm256_extractf128_si256(sum_ref_src, 1));

    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
                             _mm256_extractf128_si256(madd_ref_src, 1));

    // sign-extend the 16-bit sums to 32 bits: place each value in the upper
    // half of a 32-bit element, then arithmetic-shift it back down
    expand_sum_low = _mm_unpacklo_epi16(zero128, sum_res);
    expand_sum_high = _mm_unpackhi_epi16(zero128, sum_res);

    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);

    // interleave with zeros so that adding the two halves leaves partial
    // totals in 32-bit elements 0 and 2
    expand_madd_low = _mm_unpacklo_epi32(madd_res, zero128);
    expand_madd_high = _mm_unpackhi_epi32(madd_res, zero128);

    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum, zero128);
    ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum, zero128);

    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes so element 2 lines up with element 0
    madd_res = _mm_srli_si128(expand_madd, 8);
    sum_res = _mm_srli_si128(ex_expand_sum, 8);

    madd_res = _mm_add_epi32(madd_res, expand_madd);
    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);

    // element 0 now holds the grand total
    *SSE = (unsigned int)_mm_cvtsi128_si32(madd_res);

    *Sum = _mm_cvtsi128_si32(sum_res);
  }
}

/*
 * Accumulates the sum of the differences (src - ref) and the sum of their
 * squares over 32 columns of 8-bit pixels, one 256-bit load per row.
 *
 * NOTE: despite the name, the loop below runs 16 times, so a single call
 * covers 32 columns by 16 rows.
 * NOTE(review): presumably the caller invokes this once per 16-row band and
 * combines the results -- confirm against the call site.
 *
 *   src_ptr, source_stride  first source row and the byte step between rows
 *   ref_ptr, recon_stride   first reference row and the byte step between rows
 *   SSE                     receives the sum of (src - ref)^2
 *   Sum                     receives the signed sum of (src - ref)
 */
void vp9_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i;
  const __m256i zero_reg = _mm256_setzero_si256();
  __m256i sum_ref_src = _mm256_setzero_si256();
  __m256i madd_ref_src = _mm256_setzero_si256();

  // processing 32 elements in parallel
  for (i = 0; i < 16; i++) {
    src = _mm256_loadu_si256((__m256i const *)(src_ptr));

    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));

    // zero-extend every byte to 16 bits (per 128-bit lane)
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref, each difference lies in [-255, 255]
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // squared differences, pairwise added into 32-bit elements
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    // Running sum of differences in 16-bit elements. Each element gains at
    // most 2 * 255 per iteration, so 16 iterations stay within +/-8160 and
    // cannot overflow int16.
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // running sum of squared differences in 32-bit elements
    madd_ref_src = _mm256_add_epi32(madd_ref_src,
                                    _mm256_add_epi32(madd_low, madd_high));

    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }

  {
    __m256i expand_sum_low, expand_sum_high, expand_sum;
    __m256i expand_madd_low, expand_madd_high, expand_madd;
    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // sign-extend the 16-bit sums to 32 bits: place each value in the upper
    // half of a 32-bit element, then arithmetic-shift it back down
    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);

    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);

    // interleave with zeros so that adding the two halves leaves partial
    // totals in 32-bit elements 0 and 2 of each 128-bit lane
    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);

    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);

    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift each 128-bit lane right by 8 bytes so element 2 lines up with
    // element 0
    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);

    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);

    // element 0 of each lane holds that lane's total; add the two lanes
    *SSE = (unsigned int)(
        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)));

    *Sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
           _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
  }
}