1dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org/* 2dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org * 4dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org * Use of this source code is governed by a BSD-style license 5dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org * that can be found in the LICENSE file in the root of the source 6dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org * tree. An additional intellectual property rights grant can be found 7dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org * in the file PATENTS. All contributing project authors may 8dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org * be found in the AUTHORS file in the root of the source tree. 9dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org */ 10dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org#include "./vpx_config.h" 11dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 12dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org#include "vp9/encoder/vp9_variance.h" 13dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org#include "vpx_ports/mem.h" 14dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 15d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgtypedef void (*get_var_avx2)(const uint8_t *src, int src_stride, 16d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 17d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse, int *sum); 18d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 19d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgvoid vp9_get16x16var_avx2(const uint8_t *src, int src_stride, 20d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 21d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse, int *sum); 22d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 23d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgvoid vp9_get32x32var_avx2(const uint8_t *src, int src_stride, 24d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 25d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse, int *sum); 26d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 27d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, 28d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int x_offset, int y_offset, 29d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *dst, int dst_stride, 30d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int height, 31d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse); 32d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 33d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, 34d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int src_stride, 35d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int x_offset, 36d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int y_offset, 37d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *dst, 38d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int dst_stride, 39d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *sec, 40d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sec_stride, 41d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int height, 42d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sseptr); 43d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 44d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgstatic void variance_avx2(const uint8_t *src, int src_stride, 45d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 46d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int w, int h, unsigned int *sse, int *sum, 47d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org get_var_avx2 var_fn, int block_size) { 48dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org int i, j; 49dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 50dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org *sse = 0; 51dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org *sum = 0; 52dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 53dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org for (i = 0; i < h; i += 16) { 54dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org for (j = 0; j < w; j += block_size) { 55d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int sse0; 56d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sum0; 57d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org var_fn(&src[src_stride * i + j], src_stride, 58d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); 59dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org *sse += sse0; 60dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org *sum += sum0; 61dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org } 62dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org } 63dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org} 64dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 65dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 66d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride, 67d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 68d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse) { 69d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sum; 70d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org variance_avx2(src, src_stride, ref, ref_stride, 16, 16, 71d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sse, &sum, vp9_get16x16var_avx2, 16); 72d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((unsigned int)sum * sum) >> 8); 73dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org} 74dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 75d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride, 76d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 77d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse) { 78d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sum; 79d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); 80d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse; 81dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org} 82dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 83d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride, 84d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 85dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org unsigned int *sse) { 86d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sum; 87d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org variance_avx2(src, src_stride, ref, ref_stride, 32, 16, 88d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sse, &sum, vp9_get32x32var_avx2, 32); 89d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)sum * sum) >> 9); 90dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org} 91dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 92d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride, 93d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 94dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org unsigned int *sse) { 95d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sum; 96d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org variance_avx2(src, src_stride, ref, ref_stride, 32, 32, 97d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sse, &sum, vp9_get32x32var_avx2, 32); 98d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)sum * sum) >> 10); 99dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org} 100dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 101d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride, 102d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 103dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org unsigned int *sse) { 104d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sum; 105d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org variance_avx2(src, src_stride, ref, ref_stride, 64, 64, 106d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sse, &sum, vp9_get32x32var_avx2, 32); 107d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)sum * sum) >> 12); 108dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org} 109dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org 110d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride, 111d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const uint8_t *ref, int ref_stride, 112dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org unsigned int *sse) { 113d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org int sum; 114d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org variance_avx2(src, src_stride, ref, ref_stride, 64, 32, 115d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sse, &sum, vp9_get32x32var_avx2, 32); 116d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)sum * sum) >> 11); 117dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org} 118411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 119411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgunsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, 120411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int src_stride, 121411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int x_offset, 122411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int y_offset, 123411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org const uint8_t *dst, 124411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int dst_stride, 125d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse) { 126d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int sse1; 127d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, 128d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org y_offset, dst, dst_stride, 129d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 64, &sse1); 130411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int sse2; 131d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, 132d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org x_offset, y_offset, 133d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org dst + 32, dst_stride, 134d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 64, &sse2); 135d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se = se1 + se2; 136d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *sse = sse1 + sse2; 137d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)se * se) >> 12); 138411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 139411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 140411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgunsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src, 141411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int src_stride, 142411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int x_offset, 143411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int y_offset, 144411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org const uint8_t *dst, 145411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int dst_stride, 146d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse) { 147d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, 148d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org y_offset, dst, dst_stride, 149d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 32, sse); 150d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)se * se) >> 10); 151411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 152411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 153411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgunsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, 154411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int src_stride, 155411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int x_offset, 156411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int y_offset, 157411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org const uint8_t *dst, 158411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int dst_stride, 159d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse, 160411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org const uint8_t *sec) { 161d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int sse1; 162d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, 163d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org y_offset, dst, dst_stride, 164d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sec, 64, 64, &sse1); 165411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org unsigned int sse2; 166d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se2 = 167d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, 168d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org y_offset, dst + 32, dst_stride, 169d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sec + 32, 64, 64, &sse2); 170d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se = se1 + se2; 171411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 172d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *sse = sse1 + sse2; 173d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org 174d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)se * se) >> 12); 175411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 176411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 177411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgunsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, 178411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int src_stride, 179411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int x_offset, 180411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int y_offset, 181411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org const uint8_t *dst, 182411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org int dst_stride, 183d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org unsigned int *sse, 184411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org const uint8_t *sec) { 185411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org // processing 32 element in parallel 186d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, 187d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org y_offset, dst, dst_stride, 188d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org sec, 32, 32, sse); 189d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org return *sse - (((int64_t)se * se) >> 10); 190411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org} 191