1/* 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10#include <immintrin.h> 11#include "./vpx_dsp_rtcd.h" 12#include "vpx_ports/mem.h" 13 14#define FSAD64_H(h) \ 15unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \ 16 int src_stride, \ 17 const uint8_t *ref_ptr, \ 18 int ref_stride) { \ 19 int i, res; \ 20 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ 21 __m256i sum_sad = _mm256_setzero_si256(); \ 22 __m256i sum_sad_h; \ 23 __m128i sum_sad128; \ 24 for (i = 0 ; i < h ; i++) { \ 25 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ 26 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ 27 sad1_reg = _mm256_sad_epu8(ref1_reg, \ 28 _mm256_loadu_si256((__m256i const *)src_ptr)); \ 29 sad2_reg = _mm256_sad_epu8(ref2_reg, \ 30 _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ 31 sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ 32 ref_ptr+= ref_stride; \ 33 src_ptr+= src_stride; \ 34 } \ 35 sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ 36 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ 37 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ 38 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ 39 res = _mm_cvtsi128_si32(sum_sad128); \ 40 return res; \ 41} 42 43#define FSAD32_H(h) \ 44unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \ 45 int src_stride, \ 46 const uint8_t *ref_ptr, \ 47 int ref_stride) { \ 48 int i, res; \ 49 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ 50 __m256i sum_sad = _mm256_setzero_si256(); \ 51 __m256i sum_sad_h; \ 52 __m128i sum_sad128; \ 53 int ref2_stride = ref_stride << 1; \ 54 int src2_stride = src_stride << 1; \ 55 int max = h >> 1; \ 56 for (i = 0 ; i < max ; i++) { \ 57 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ 58 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ 59 sad1_reg = _mm256_sad_epu8(ref1_reg, \ 60 _mm256_loadu_si256((__m256i const *)src_ptr)); \ 61 sad2_reg = _mm256_sad_epu8(ref2_reg, \ 62 _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ 63 sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ 64 ref_ptr+= ref2_stride; \ 65 src_ptr+= src2_stride; \ 66 } \ 67 sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ 68 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ 69 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ 70 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ 71 res = _mm_cvtsi128_si32(sum_sad128); \ 72 return res; \ 73} 74 75#define FSAD64 \ 76FSAD64_H(64); \ 77FSAD64_H(32); 78 79#define FSAD32 \ 80FSAD32_H(64); \ 81FSAD32_H(32); \ 82FSAD32_H(16); 83 84FSAD64; 85FSAD32; 86 87#undef FSAD64 88#undef FSAD32 89#undef FSAD64_H 90#undef FSAD32_H 91 92#define FSADAVG64_H(h) \ 93unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \ 94 int src_stride, \ 95 const uint8_t *ref_ptr, \ 96 int ref_stride, \ 97 const uint8_t *second_pred) { \ 98 int i, res; \ 99 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ 100 __m256i sum_sad = _mm256_setzero_si256(); \ 101 __m256i sum_sad_h; \ 102 __m128i sum_sad128; \ 103 for (i = 0 ; i < h ; i++) { \ 104 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ 105 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ 106 ref1_reg = _mm256_avg_epu8(ref1_reg, \ 107 _mm256_loadu_si256((__m256i const *)second_pred)); \ 108 ref2_reg = _mm256_avg_epu8(ref2_reg, \ 109 _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ 110 sad1_reg = _mm256_sad_epu8(ref1_reg, \ 111 _mm256_loadu_si256((__m256i const *)src_ptr)); \ 112 sad2_reg = _mm256_sad_epu8(ref2_reg, \ 113 _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ 114 sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ 115 ref_ptr+= ref_stride; \ 116 src_ptr+= src_stride; \ 117 second_pred+= 64; \ 118 } \ 119 sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ 120 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ 121 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ 122 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ 123 res = _mm_cvtsi128_si32(sum_sad128); \ 124 return res; \ 125} 126 127#define FSADAVG32_H(h) \ 128unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \ 129 int src_stride, \ 130 const uint8_t *ref_ptr, \ 131 int ref_stride, \ 132 const uint8_t *second_pred) { \ 133 int i, res; \ 134 __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ 135 __m256i sum_sad = _mm256_setzero_si256(); \ 136 __m256i sum_sad_h; \ 137 __m128i sum_sad128; \ 138 int ref2_stride = ref_stride << 1; \ 139 int src2_stride = src_stride << 1; \ 140 int max = h >> 1; \ 141 for (i = 0 ; i < max ; i++) { \ 142 ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ 143 ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ 144 ref1_reg = _mm256_avg_epu8(ref1_reg, \ 145 _mm256_loadu_si256((__m256i const *)second_pred)); \ 146 ref2_reg = _mm256_avg_epu8(ref2_reg, \ 147 _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ 148 sad1_reg = _mm256_sad_epu8(ref1_reg, \ 149 _mm256_loadu_si256((__m256i const *)src_ptr)); \ 150 sad2_reg = _mm256_sad_epu8(ref2_reg, \ 151 _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ 152 sum_sad = _mm256_add_epi32(sum_sad, \ 153 _mm256_add_epi32(sad1_reg, sad2_reg)); \ 154 ref_ptr+= ref2_stride; \ 155 src_ptr+= src2_stride; \ 156 second_pred+= 64; \ 157 } \ 158 sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ 159 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ 160 sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ 161 sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ 162 res = _mm_cvtsi128_si32(sum_sad128); \ 163 return res; \ 164} 165 166#define FSADAVG64 \ 167FSADAVG64_H(64); \ 168FSADAVG64_H(32); 169 170#define FSADAVG32 \ 171FSADAVG32_H(64); \ 172FSADAVG32_H(32); \ 173FSADAVG32_H(16); 174 175FSADAVG64; 176FSADAVG32; 177 178#undef FSADAVG64 179#undef FSADAVG32 180#undef FSADAVG64_H 181#undef FSADAVG32_H 182