/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vpx/vpx_integer.h"  /* uint8_t, int64_t */
#include "vpx_ports/mem.h"

typedef unsigned int (*variance_fn_t)(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse, int *sum);

/* Prototypes for the SIMD kernels; each computes the sum of squared
 * differences (*sse) and the signed sum of differences (*sum) for one
 * fixed-size block. */
unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);

unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int *sse, int *sum);

unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse, int *sum);

/* Accumulates *sse and *sum over a w x h region by applying the
 * block_size x block_size kernel var_fn to each sub-block in turn. */
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
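
/*
 * Note on the return expression shared by the variance functions below:
 * for an N = w * h pixel block,
 *
 *   variance = SSE - (sum * sum) / N,
 *
 * and since N is a power of two the division becomes a right shift by
 * log2(w * h). For blocks up to 16x16, |sum| <= 16 * 16 * 255 = 65280
 * and 65280 * 65280 = 4261478400 still fits in 32 bits, so the
 * (unsigned int) cast is sufficient; from 32x16 upward the squared sum
 * can exceed 32 bits, which is why those functions widen to int64_t
 * before squaring.
 */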

unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 4,
                sse, &sum, vp9_get4x4var_mmx, 4);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, vp9_get4x4var_mmx, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, vp9_get4x4var_mmx, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

/* MSE returns the raw SSE; no mean correction is applied. */
unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}

unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

/* Prototypes for the width-specialized sub-pixel variance kernels: the 8-
 * and 16-wide kernels are built in the opt1 flavor, the 4-wide kernel in
 * the opt2 flavor. */
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
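
/*
 * The kernels above handle fixed widths of 4, 8 and 16. The FN macro below
 * builds every block size by tiling: one call covers the leftmost wf
 * columns, and further calls at offsets +16, +32 and +48 cover the rest
 * (the literal offsets assume wf == 16, which holds for every FNS entry
 * with w > wf). The partial sums and SSEs are accumulated and combined
 * with the same variance identity used above, shifting by
 * wlog2 + hlog2 = log2(w * h).
 */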

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + 16, dst_stride, \
                                                   h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 32, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 48, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

/* The avg variants additionally take a second predictor sec, which the
 * kernels average with the sub-pixel filtered src before differencing
 * against dst; note the calls below pass the full block width w as sec's
 * stride. */
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                            ptrdiff_t src_stride, \
                                            int x_offset, int y_offset, \
                                            const uint8_t *dst, \
                                            ptrdiff_t dst_stride, \
                                            const uint8_t *sec, \
                                            ptrdiff_t sec_stride, \
                                            int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst + 16, dst_stride, \
                                                       sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 32, dst_stride, \
                                                     sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 48, dst_stride, \
                                                     sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
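
/*
 * Illustrative usage (a sketch, not part of the library): computing the
 * variance of one 16x16 block against a reference. In the encoder these
 * entry points are normally reached through the run-time CPU dispatch
 * tables rather than called directly, and the buffers here are
 * hypothetical.
 *
 *   DECLARE_ALIGNED(16, uint8_t, src[16 * 16]);
 *   DECLARE_ALIGNED(16, uint8_t, ref[16 * 16]);
 *   unsigned int sse, var;
 *   ...fill src and ref...
 *   var = vp9_variance16x16_sse2(src, 16, ref, 16, &sse);
 */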